@@ -92,6 +92,7 @@ enum Ops {
92
92
OP_DOUBLECOPY,
93
93
OP_REDUCE,
94
94
OP_REDUCECOPY,
95
+ OP_READ,
95
96
NUM_OPS,
96
97
};
97
98
@@ -123,7 +124,9 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p
123
124
if (op == OP_DOUBLECOPY) DoubleCopy<DOUBLECOPY_UNROLL, THREADS, float >(transfer_data->dest0 [bid], transfer_data->dest1 [bid], transfer_data->src0 [bid], n);
124
125
if (op == OP_REDUCE) Reduce<REDUCE_UNROLL, THREADS, float >(transfer_data->dest0 [bid], transfer_data->src0 [bid], transfer_data->src1 [bid], n);
125
126
if (op == OP_REDUCECOPY) ReduceCopy<REDUCECOPY_UNROLL, THREADS, float >(transfer_data->dest0 [bid], transfer_data->dest1 [bid], transfer_data->src0 [bid], transfer_data->src1 [bid], n);
126
-
127
+ // Swapped dest0 and src0 in the parameters passed to the copy kernel so that it can be utilized as a read kernel.
128
+ // The fetch will happen on transfer_data->dest0[bid] and the store will happen on transfer_data->src0[bid].
129
+ if (op == OP_READ) Copy<COPY_UNROLL, THREADS, float >(transfer_data->src0 [bid],transfer_data->dest0 [bid], n);
127
130
__syncthreads ();
128
131
if (idx == 0 ) {
129
132
next_time = clock64 ();
@@ -145,6 +148,8 @@ static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
145
148
flag_sync_kernel<OP_REDUCE, 1 >,
146
149
flag_sync_kernel<OP_REDUCECOPY, 0 >,
147
150
flag_sync_kernel<OP_REDUCECOPY, 1 >,
151
+ flag_sync_kernel<OP_READ, 0 >,
152
+ flag_sync_kernel<OP_READ, 1 >,
148
153
};
149
154
150
155
__global__ void initTestDataKernel (float * data, const size_t N, const int gpu) {
@@ -294,9 +299,9 @@ int main(int argc,char* argv[])
294
299
sync = atol (s);
295
300
if (sync ) printf (" Sync all GPUs before operation\n " );
296
301
297
- const char *ops[] = {" copy" , " localcopy" , " doublecopy" , " reduce" , " reducecopy" , " all" };
302
+ const char *ops[] = {" copy" , " localcopy" , " doublecopy" , " reduce" , " reducecopy" , " read " , " all" };
298
303
char *prim = getCmdOption (argv, argv + argc, " -p" );
299
- int op = 5 , begin_op, end_op;
304
+ int op = NUM_OPS , begin_op, end_op;
300
305
if (prim) {
301
306
for (op = 0 ; op < sizeof (ops); op++)
302
307
if (!strcmp ((const char *)prim, ops[op]))
@@ -315,8 +320,8 @@ int main(int argc,char* argv[])
315
320
// Enable peer access
316
321
setupPeers (connection_info);
317
322
// clockwise and counter clockwise rings
318
- int ring_0[MAX_GPU] = {-1 , -1 , -1 , -1 };
319
- int ring_1[MAX_GPU] = {-1 , -1 , -1 , -1 };
323
+ int ring_0[MAX_GPU] = {-1 , -1 , -1 , -1 ,- 1 , - 1 , - 1 , - 1 };
324
+ int ring_1[MAX_GPU] = {-1 , -1 , -1 , -1 ,- 1 , - 1 , - 1 , - 1 };
320
325
setupRings (connection_info, ring_0, ring_1);
321
326
322
327
// data buffers
@@ -392,7 +397,7 @@ int main(int argc,char* argv[])
392
397
393
398
uint64_t opCount = 0 ;
394
399
for (int op = begin_op; op < end_op; op ++) {
395
- const char *OpsName[] = {" Copy" , " Local Copy" , " Double Copy" , " Reduce" , " ReduceCopy" };
400
+ const char *OpsName[] = {" Copy" , " Local Copy" , " Double Copy" , " Reduce" , " ReduceCopy" , " read " };
396
401
printf (" \n [Testing %s]: \n " , OpsName[op]);
397
402
// 4 warm up cycles
398
403
for (int i = 0 ; i < 4 ; i ++) {
0 commit comments