#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#define HASH_SIZE 32
__constant uint IV[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
__constant ulong BLAKE2B_IV[8] = {
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
};
__constant uchar sigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 }
};
typedef struct sols_s {
uint nr;
uint likely_invalids;
uint values[2000][512];
uint valid[2000];
} sols_t;
uint rotr32(uint x, uint n) {
return (x >> n) | (x << (32 - n));
}
ulong rotr64(ulong x, ulong n) {
return (x >> n) | (x << (64 - n));
}
void blake2s_core(__global const uchar* input, uint len, __global uchar* out) {
uint m[16] = {0};
for (int i = 0; i < 16 && (i * 4 + 3) < len; ++i) {
m[i] = input[i4 + 0] | (input[i4 + 1] << 8) | (input[i4 + 2] << 16) | (input[i4 + 3] << 24);
}
uint v[16];
for (int i = 0; i < 8; ++i) {
v[i] = IV[i];
v[i + 8] = IV[i];
}
v[12] ^= len;
for (int r = 0; r < 10; ++r) {
const __constant uchar* s = sigma[r];
#define G(a,b,c,d,x,y) \
a += b + x; \
d = rotr32(d ^ a, 16); \
c += d; \
b = rotr32(b ^ c, 12); \
a += b + y; \
d = rotr32(d ^ a, 8); \
c += d; \
b = rotr32(b ^ c, 7);
G(v[0],v[4],v[8],v[12], m[s[0]], m[s[1]]);
G(v[1],v[5],v[9],v[13], m[s[2]], m[s[3]]);
G(v[2],v[6],v[10],v[14], m[s[4]], m[s[5]]);
G(v[3],v[7],v[11],v[15], m[s[6]], m[s[7]]);
G(v[0],v[5],v[10],v[15], m[s[8]], m[s[9]]);
G(v[1],v[6],v[11],v[12], m[s[10]], m[s[11]]);
G(v[2],v[7],v[8],v[13], m[s[12]], m[s[13]]);
G(v[3],v[4],v[9],v[14], m[s[14]], m[s[15]]);
#undef G
}
for (int i = 0; i < 8; ++i) {
uint h = v[i] ^ v[i + 8];
out[i*4 + 0] = h & 0xFF;
out[i*4 + 1] = (h >> 8) & 0xFF;
out[i*4 + 2] = (h >> 16) & 0xFF;
out[i*4 + 3] = (h >> 24) & 0xFF;
}
}
__kernel void kernel_init_ht(__global char *ht)
{
uint tid = get_global_id(0);
*(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0;
}
uint ht_store(uint round, __global char *ht, uint i, ulong xi0, ulong xi1, ulong xi2, ulong xi3)
{
uint row;
__global char *p;
uint cnt;
if (!(round % 2))
row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
else
row = ((xi0 & 0xf0000) >> 0) | ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
cnt = atomic_inc((__global uint *)p);
if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9))
return 1;
p += cnt * 32 + (8 + ((round) / 2) * 4);
*(__global uint *)(p - 4) = i;
if (round == 0 || round == 1) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global ulong *)(p + 16) = xi2;
} else if (round == 2) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global uint *)(p + 16) = xi2;
} else if (round == 3 || round == 4) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
} else if (round == 5) {
*(__global ulong *)(p + 0) = xi0;
*(__global uint *)(p + 8) = xi1;
} else if (round == 6 || round == 7) {
*(__global ulong *)(p + 0) = xi0;
} else if (round == 8) {
*(__global uint *)(p + 0) = xi0;
}
return 0;
}
__kernel attribute((reqd_work_group_size(64, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht, __global uint *debug)
{
uint tid = get_global_id(0);
ulong v[16];
uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
uint input = tid * inputs_per_thread;
uint input_end = (tid + 1) * inputs_per_thread;
uint dropped = 0;
while (input < input_end)
{
ulong word1 = (ulong)input << 32;
v[0] = blake_state[0];
v[1] = blake_state[1];
v[2] = blake_state[2];
v[3] = blake_state[3];
v[4] = blake_state[4];
v[5] = blake_state[5];
v[6] = blake_state[6];
v[7] = blake_state[7];
v[8] = BLAKE2B_IV[0];
v[9] = BLAKE2B_IV[1];
v[10] = BLAKE2B_IV[2];
v[11] = BLAKE2B_IV[3];
v[12] = BLAKE2B_IV[4];
v[13] = BLAKE2B_IV[5];
v[14] = BLAKE2B_IV[6];
v[15] = BLAKE2B_IV[7];
v[12] ^= 140 + 4;
v[14] ^= -1;
// RUNDE 1
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 2
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 3
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 4
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 5
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 6
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + word1); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 7
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + word1); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 8
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + 0); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
// RUNDE 9
v[0] = (v[0] + v[4] + 0); v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0);
v[12] = rotr64((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]);
v[4] = rotr64((v[4] ^ v[8]), (ulong)64 - 63);;
v[1] = (v[1] + v[5] + 0); v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0);
v[13] = rotr64((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]);
v[5] = rotr64((v[5] ^ v[9]), (ulong)64 - 63);;
v[2] = (v[2] + v[6] + 0); v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0);
v[14] = rotr64((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]);
v[6] = rotr64((v[6] ^ v[10]), (ulong)64 - 63);;
v[3] = (v[3] + v[7] + 0); v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0);
v[15] = rotr64((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]);
v[7] = rotr64((v[7] ^ v[11]), (ulong)64 - 63);;
v[0] = (v[0] + v[5] + word1); v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0);
v[15] = rotr64((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]);
v[5] = rotr64((v[5] ^ v[10]), (ulong)64 - 63);;
v[1] = (v[1] + v[6] + 0); v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0);
v[12] = rotr64((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]);
v[6] = rotr64((v[6] ^ v[11]), (ulong)64 - 63);;
v[2] = (v[2] + v[7] + 0); v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0);
v[13] = rotr64((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]);
v[7] = rotr64((v[7] ^ v[8]), (ulong)64 - 63);;
v[3] = (v[3] + v[4] + 0); v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0);
v[14] = rotr64((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]);
v[4] = rotr64((v[4] ^ v[9]), (ulong)64 - 63);;
ulong h[7];
h[0] = blake_state[0] ^ v[0] ^ v[8];
h[1] = blake_state[1] ^ v[1] ^ v[9];
h[2] = blake_state[2] ^ v[2] ^ v[10];
h[3] = blake_state[3] ^ v[3] ^ v[11];
h[4] = blake_state[4] ^ v[4] ^ v[12];
h[5] = blake_state[5] ^ v[5] ^ v[13];
h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
dropped += ht_store(0, ht, input * 2, h[0], h[1], h[2], h[3]);
dropped += ht_store(0, ht, input * 2 + 1, (h[3] >> 8) | (h[4] << (64 - 8)), (h[4] >> 8) | (h[5] << (64 - 8)), (h[5] >> 8) | (h[6] << (64 - 8)), (h[6] >> 8));
input++;
}
}
uint xor_and_store(uint round, __global char *ht_dst, uint row, uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
ulong xi0, xi1, xi2;
if (round == 1 || round == 2) {
xi0 = *(a) ^ *(b);
xi1 = *(a) ^ *(b);
xi2 = *a ^ *b;
if (round == 2) {
xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
xi2 = (xi2 >> 8);
}
} else if (round == 3) {
xi0 = *a++ ^ *b;
xi1 = *a ^ *b;
xi2 = *(__global uint *)a ^ *(__global uint *)b;
} else if (round == 4 || round == 5) {
xi0 = *a ^ *b;
xi1 = *a ^ *b;
xi2 = 0;
if (round == 4) {
xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
xi1 = (xi1 >> 8);
}
} else if (round == 6) {
xi0 = *a ^ *b++;
xi1 = *(__global uint *)a ^ *(__global uint *)b;
xi2 = 0;
if (round == 6) {
xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
xi1 = (xi1 >> 8);
}
} else if (round == 7 || round == 8) {
xi0 = *a ^ *b;
xi1 = 0;
xi2 = 0;
if (round == 8) {
xi0 = (xi0 >> 8);
}
}
if (!xi0 && !xi1)
return 0;
return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), xi0, xi1, xi2, 0);
}
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, __global uint *debug)
{
uint tid = get_global_id(0);
__global char *p;
uint cnt;
uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)];
uchar mask;
uint i, j;
ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 3];
uint nr_coll = 0;
uint n;
uint dropped_coll, dropped_stor;
__global ulong *a, *b;
uint xi_offset;
xi_offset = (8 + ((round - 1) / 2) * 4);
mask = 0;
p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32);
cnt = *(__global uint *)p;
cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
p += xi_offset;
for (i = 0; i < cnt; i++, p += 32)
first_words[i] = *(__global uchar *)p;
nr_coll = 0;
dropped_coll = 0;
for (i = 0; i < cnt; i++)
for (j = i + 1; j < cnt; j++)
if ((first_words[i] & mask) == (first_words[j] & mask)) {
if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
dropped_coll++;
else
collisions[nr_coll++] = ((ushort)j << 8) | ((ushort)i & 0xff);
}
dropped_stor = 0;
for (n = 0; n < nr_coll; n++) {
i = collisions[n] & 0xff;
j = collisions[n] >> 8;
a = (__global ulong *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + i * 32 + xi_offset);
b = (__global ulong *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + j * 32 + xi_offset);
dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
}
if (round < 8)
*(__global uint *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0;
}
// Equihash Round Kernels
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(64, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug, __global sols_t *sols)
{
uint tid = get_global_id(0);
equihash_round(8, ht_src, ht_dst, debug);
if (!tid)
sols->nr = sols->likely_invalids = 0;
}
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + slot * 32 + xi_offset - 4);
}
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, uint round)
{
__global char *ht = htabs[round % 2];
uint i = nr_inputs - 1;
uint j = nr_inputs * 2 - 1;
uint xi_offset = (8 + ((round) / 2) * 4);
do {
ins[j] = expand_ref(ht, xi_offset, (ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
ins[j - 1] = expand_ref(ht, xi_offset, (ins[i] >> 12), (ins[i] & 0x3f));
if (!i)
break ;
i--;
j -= 2;
} while (1);
}
void potential_sol(__global char **htabs, __global sols_t *sols, uint ref0, uint ref1)
{
uint sol_i;
uint nr_values;
sol_i = atomic_inc(&sols->nr);
if (sol_i >= 2000)
return ;
sols->valid[sol_i] = 0;
nr_values = 0;
sols->values[sol_i][nr_values] = ref0;
sols->values[sol_i][nr_values] = ref1;
uint round = 9 - 1;
do {
round--;
expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
nr_values *= 2;
} while (round > 0);
sols->valid[sol_i] = 1;
}
__kernel
void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
uint tid = get_global_id(0);
__global char *htabs[2] = { ht0, ht1 };
uint ht_i = (9 - 1) % 2;
uint cnt;
uint xi_offset = (8 + ((9 - 1) / 2) * 4);
uint i, j;
__global char *a, *b;
uint ref_i, ref_j;
ulong collisions[5];
uint coll;
uint mask = 0xffffff;
a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
cnt = *(__global uint *)a;
cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
coll = 0;
a += xi_offset;
for (i = 0; i < cnt; i++, a += 32) {
for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) {
uint val_a = (*(__global uint *)a) & mask;
uint val_b = (*(__global uint *)b) & mask;
if (val_a == val_b) {
if (coll < sizeof (collisions) / sizeof (*collisions)) {
collisions[coll++] = ((ulong)i << 32) | j;
} else {
atomic_inc(&sols->likely_invalids);
}
}
}
}
if (!coll)
return ;
for (i = 0; i < coll; i++)
potential_sol(htabs, sols, collisions[i] >> 32, collisions[i] & 0xffffffff);
}