Skip to content

Commit a270ee0

Browse files
committed
Read operation throughput
1 parent 22c9ae0 commit a270ee0

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

tools/rccl-prim-test/rccl_prim_test.cpp

+11-6
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ enum Ops {
9292
OP_DOUBLECOPY,
9393
OP_REDUCE,
9494
OP_REDUCECOPY,
95+
OP_READ,
9596
NUM_OPS,
9697
};
9798

@@ -123,7 +124,9 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p
123124
if (op == OP_DOUBLECOPY) DoubleCopy<DOUBLECOPY_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->dest1[bid], transfer_data->src0[bid], n);
124125
if (op == OP_REDUCE) Reduce<REDUCE_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->src0[bid], transfer_data->src1[bid], n);
125126
if (op == OP_REDUCECOPY) ReduceCopy<REDUCECOPY_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->dest1[bid], transfer_data->src0[bid], transfer_data->src1[bid], n);
126-
127+
// The dest0 and src0 arguments passed to the Copy kernel are swapped so that it can be used as a read kernel.
128+
// The fetch op happens on transfer_data->dest0[bid] and the store op happens on transfer_data->src0[bid].
129+
if (op == OP_READ) Copy<COPY_UNROLL, THREADS, float>(transfer_data->src0[bid],transfer_data->dest0[bid], n);
127130
__syncthreads();
128131
if (idx == 0) {
129132
next_time = clock64();
@@ -145,6 +148,8 @@ static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
145148
flag_sync_kernel<OP_REDUCE, 1>,
146149
flag_sync_kernel<OP_REDUCECOPY, 0>,
147150
flag_sync_kernel<OP_REDUCECOPY, 1>,
151+
flag_sync_kernel<OP_READ, 0>,
152+
flag_sync_kernel<OP_READ, 1>,
148153
};
149154

150155
__global__ void initTestDataKernel(float* data, const size_t N, const int gpu) {
@@ -294,9 +299,9 @@ int main(int argc,char* argv[])
294299
sync = atol(s);
295300
if (sync) printf("Sync all GPUs before operation\n");
296301

297-
const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "all"};
302+
const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "read", "all"};
298303
char *prim = getCmdOption(argv, argv + argc, "-p");
299-
int op = 5, begin_op, end_op;
304+
int op = NUM_OPS, begin_op, end_op;
300305
if (prim) {
301306
for (op = 0; op < sizeof(ops); op++)
302307
if (!strcmp((const char *)prim, ops[op]))
@@ -315,8 +320,8 @@ int main(int argc,char* argv[])
315320
// Enable peer access
316321
setupPeers(connection_info);
317322
// clockwise and counter clockwise rings
318-
int ring_0[MAX_GPU] = {-1, -1, -1, -1};
319-
int ring_1[MAX_GPU] = {-1, -1, -1, -1};
323+
int ring_0[MAX_GPU] = {-1, -1, -1, -1,-1, -1, -1, -1};
324+
int ring_1[MAX_GPU] = {-1, -1, -1, -1,-1, -1, -1, -1};
320325
setupRings(connection_info, ring_0, ring_1);
321326

322327
// data buffers
@@ -392,7 +397,7 @@ int main(int argc,char* argv[])
392397

393398
uint64_t opCount = 0;
394399
for (int op = begin_op; op < end_op; op ++) {
395-
const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Reduce", "ReduceCopy"};
400+
const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Reduce", "ReduceCopy","read"};
396401
printf("\n[Testing %s]: \n", OpsName[op]);
397402
// 4 warm up cycles
398403
for (int i = 0; i < 4; i ++) {

0 commit comments

Comments
 (0)