/* SM 2/3/3.5 Variant for lyra2REv2 */

#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#endif

/* threads per block, per targeted compute capability */
#define TPB20 64
#define TPB30 64
#define TPB35 64

/* device code below is compiled only for SM 2.x through 4.x;
 * SM 5.x+ uses a different variant (see the #else stub at file end) */
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500

#include "cuda_lyra2_vectors.h"
@@ -165,6 +166,7 @@ void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, v
165
166
}
166
167
}
167
168
169
+ #if __CUDA_ARCH__ >= 300
168
170
__global__ __launch_bounds__ (TPB35, 1 )
169
171
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
170
172
{
@@ -177,14 +179,14 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
177
179
if (threadIdx .x == 0 ) {
178
180
179
181
((uint16*)blake2b_IV)[0 ] = make_uint16 (
180
- 0xf3bcc908 , 0x6a09e667 , 0x84caa73b , 0xbb67ae85 ,
181
- 0xfe94f82b , 0x3c6ef372 , 0x5f1d36f1 , 0xa54ff53a ,
182
- 0xade682d1 , 0x510e527f , 0x2b3e6c1f , 0x9b05688c ,
182
+ 0xf3bcc908 , 0x6a09e667 , 0x84caa73b , 0xbb67ae85 ,
183
+ 0xfe94f82b , 0x3c6ef372 , 0x5f1d36f1 , 0xa54ff53a ,
184
+ 0xade682d1 , 0x510e527f , 0x2b3e6c1f , 0x9b05688c ,
183
185
0xfb41bd6b , 0x1f83d9ab , 0x137e2179 , 0x5be0cd19
184
186
);
185
187
((uint16*)padding)[0 ] = make_uint16 (
186
- 0x20 , 0x0 , 0x20 , 0x0 , 0x20 , 0x0 , 0x01 , 0x0 ,
187
- 0x04 , 0x0 , 0x04 , 0x0 , 0x80 , 0x0 , 0x0 , 0x01000000
188
+ 0x20 , 0x0 , 0x20 , 0x0 , 0x20 , 0x0 , 0x01 , 0x0 ,
189
+ 0x04 , 0x0 , 0x04 , 0x0 , 0x80 , 0x0 , 0x0 , 0x01000000
188
190
);
189
191
}
190
192
@@ -194,6 +196,7 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
194
196
((uint2 *)state)[1 ] = __ldg (&outputHash[thread + threads]);
195
197
((uint2 *)state)[2 ] = __ldg (&outputHash[thread + 2 * threads]);
196
198
((uint2 *)state)[3 ] = __ldg (&outputHash[thread + 3 * threads]);
199
+
197
200
state[1 ] = state[0 ];
198
201
state[2 ] = shuffle4 (((vectype*)blake2b_IV)[0 ], 0 );
199
202
state[3 ] = shuffle4 (((vectype*)blake2b_IV)[1 ], 0 );
@@ -246,9 +249,90 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
246
249
247
250
} // thread
248
251
}
252
#elif __CUDA_ARCH__ >= 200
/* SM 2.x fallback of the Lyra2v2 32-byte hash kernel.
 * Same contract as the SM 3.x path above: reads 4 uint2 lanes of the input
 * hash from outputHash (strided by `threads`), runs the Lyra2 sponge
 * (setup, wandering and wrap-up phases) against the global scratchpad
 * DMatrix, and writes the 4 uint2 result lanes back in place.
 * NOTE(review): `vectype`, `uint16`, `make_uint16`, `DMatrix`, `memshift`,
 * `round_lyra_v35`, `reduceDuplex*V3` and `__ldg4` come from
 * cuda_lyra2_vectors.h / earlier in this file — not visible here. */
__global__ __launch_bounds__(TPB20, 1)
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	vectype state[4];
	vectype blake2b_IV[2];
	vectype padding[2];

	/* BLAKE2b IV, stored low-word-first per 64-bit constant.
	 * Unlike the SM 3.x path (shared memory + shuffle broadcast), every
	 * thread materializes its own copy here — no warp shuffle on SM 2.x. */
	((uint16*)blake2b_IV)[0] = make_uint16(
		0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
		0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
		0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
		0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
	);
	/* Lyra2 parameter block: key/salt/basil lengths plus final 0x80…01 padding */
	((uint16*)padding)[0] = make_uint16(
		0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
		0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
	);

	if (thread < threads)
	{
		/* load the 32-byte input hash, one uint2 lane per `threads` stride */
		((uint2*)state)[0] = outputHash[thread];
		((uint2*)state)[1] = outputHash[thread + threads];
		((uint2*)state)[2] = outputHash[thread + 2 * threads];
		((uint2*)state)[3] = outputHash[thread + 3 * threads];

		/* sponge init: duplicate the message block, then the BLAKE2b IV */
		state[1] = state[0];
		state[2] = ((vectype*)blake2b_IV)[0];
		state[3] = ((vectype*)blake2b_IV)[1];

		/* absorb: 12 full rounds */
		for (int i = 0; i < 12; i++)
			round_lyra_v35(state);

		/* inject the padded parameter block, then 12 more rounds */
		state[0] ^= ((vectype*)padding)[0];
		state[1] ^= ((vectype*)padding)[1];

		for (int i = 0; i < 12; i++)
			round_lyra_v35(state);

		/* setup phase: fill row 0 of this thread's DMatrix slice backwards
		 * (columns 3..0), squeezing one reduced round between columns */
		uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);

		for (int i = 0; i < 4; i++)
		{
			uint32_t s1 = ps1 - 4 * memshift * i;
			for (int j = 0; j < 3; j++)
				(DMatrix + s1)[j] = (state)[j];

			round_lyra_v35(state);
		}

		/* remaining setup rows 1..3 via duplex operations */
		reduceDuplexV3(state, thread);
		reduceDuplexRowSetupV3(1, 0, 2, state, thread);
		reduceDuplexRowSetupV3(2, 1, 3, state, thread);

		/* wandering phase: row index is data-dependent (low 2 bits of state) */
		uint32_t rowa;
		int prev = 3;
		for (int i = 0; i < 4; i++)
		{
			rowa = ((uint2*)state)[0].x & 3;
			reduceDuplexRowtV3(prev, rowa, i, state, thread);
			prev = i;
		}

		/* wrap-up: absorb the last visited row, then 12 final rounds */
		uint32_t shift = (memshift * rowa + 16 * memshift * thread);

		for (int j = 0; j < 3; j++)
			state[j] ^= __ldg4(&(DMatrix + shift)[j]);

		for (int i = 0; i < 12; i++)
			round_lyra_v35(state);

		/* squeeze the 32-byte digest back into outputHash, same striding */
		outputHash[thread] = ((uint2*)state)[0];
		outputHash[thread + threads] = ((uint2*)state)[1];
		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
		outputHash[thread + 3 * threads] = ((uint2*)state)[3];

	} // thread
}
#endif

#else
/* host compilation pass & SM 5.x+: empty stub so the kernel symbol still
 * exists; the real SM5+ implementation lives in another compilation unit */
__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif