Skip to content

Commit e1e2899

Browse files
committed
Update Handson to use userbuffer
1 parent b4d0847 commit e1e2899

File tree

6 files changed

+98
-9
lines changed

6 files changed

+98
-9
lines changed

08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU
2121
- Fix output message to indicate nccl rather than mpi
2222
- Destroy NCCL communicator
2323

24+
If you have time left:
25+
- Use ncclMemAlloc to allocate the buffers and register them for communication
26+
- Don`t forget to deregister and free the buffers correctly
27+
2428
Compile with
2529

2630
``` {.bash}

08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
2-
THIS_TASK := 08H-NCCL-sol
2+
THIS_TASK := 08H-NCCL-task
33
OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M')
44
NP ?= 1
55
NVCC=nvcc
@@ -42,5 +42,8 @@ sanitize: jacobi
4242
run: jacobi
4343
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi
4444

45+
run_user_buffer: jacobi
46+
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg
47+
4548
profile: jacobi
4649
$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10

08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,12 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
9393

9494
//TODO: include NCCL headers
9595
#include <nccl.h>
96+
#ifdef NCCL_VERSION
97+
#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
98+
#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
99+
#else
100+
#define NCCL_UB_SUPPORT 0
101+
#endif
96102

97103
#define NCCL_CALL(call) \
98104
{ \
@@ -168,7 +174,13 @@ int main(int argc, char* argv[]) {
168174
const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
169175
const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
170176
const bool csv = get_arg(argv, argv + argc, "-csv");
171-
177+
bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
178+
#if NCCL_UB_SUPPORT == 0
179+
if (user_buffer_reg) {
180+
fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
181+
user_buffer_reg = false;
182+
}
183+
#endif //NCCL_UB_SUPPORT == 0
172184
int local_rank = -1;
173185
{
174186
MPI_Comm local_comm;
@@ -220,10 +232,27 @@ int main(int argc, char* argv[]) {
220232
chunk_size = chunk_size_high;
221233

222234
real* a;
223-
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
224235
real* a_new;
225-
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
226236

237+
#if NCCL_UB_SUPPORT
238+
void* a_reg_handle;
239+
void* a_new_reg_handle;
240+
if (user_buffer_reg) {
241+
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
242+
NCCL_CALL(ncclMemAlloc( (void**) &a , nx * (chunk_size + 2) * sizeof(real)));
243+
NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real)));
244+
NCCL_CALL(ncclCommRegister(nccl_comm, a , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle));
245+
NCCL_CALL(ncclCommRegister(nccl_comm, a_new, nx * (chunk_size + 2) * sizeof(real), &a_new_reg_handle));
246+
if ( nccl_version < 22304 ) {
247+
fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
248+
}
249+
}
250+
else
251+
#endif //NCCL_UB_SUPPORT
252+
{
253+
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
254+
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
255+
}
227256
CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real)));
228257
CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real)));
229258

@@ -403,10 +432,20 @@ int main(int argc, char* argv[]) {
403432

404433
CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
405434
CUDA_RT_CALL(cudaFree(l2_norm_d));
406-
435+
#if NCCL_UB_SUPPORT
436+
if (user_buffer_reg) {
437+
//TODO: Deregister and Free the Buffer
438+
NCCL_CALL(ncclCommDeregister(nccl_comm, a_new_reg_handle));
439+
NCCL_CALL(ncclCommDeregister(nccl_comm, a_reg_handle));
440+
NCCL_CALL(ncclMemFree(a_new));
441+
NCCL_CALL(ncclMemFree(a));
442+
}
443+
else
444+
#endif //NCCL_UB_SUPPORT
445+
{
407446
CUDA_RT_CALL(cudaFree(a_new));
408447
CUDA_RT_CALL(cudaFree(a));
409-
448+
}
410449
CUDA_RT_CALL(cudaFreeHost(a_h));
411450
CUDA_RT_CALL(cudaFreeHost(a_ref_h));
412451

08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU
2121
- Fix output message to indicate nccl rather than mpi
2222
- Destroy NCCL communicator
2323

24+
If you have time left:
25+
- Use ncclMemAlloc to allocate the buffers and register them for communication
26+
- Don`t forget to deregister and free the buffers correctly
27+
2428
Compile with
2529

2630
``` {.bash}

08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,8 @@ sanitize: jacobi
4242
run: jacobi
4343
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi
4444

45+
run_user_buffer: jacobi
46+
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg
47+
4548
profile: jacobi
4649
$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10

08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <sstream>
3232

3333
#include <mpi.h>
34+
3435
#define MPI_CALL(call) \
3536
{ \
3637
int mpi_status = call; \
@@ -93,6 +94,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
9394

9495
//TODO: include NCCL headers
9596

97+
#ifdef NCCL_VERSION
98+
#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
99+
#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
100+
#else
101+
#define NCCL_UB_SUPPORT 0
102+
#endif
103+
96104
#define NCCL_CALL(call) \
97105
{ \
98106
ncclResult_t ncclStatus = call; \
@@ -164,7 +172,14 @@ int main(int argc, char* argv[]) {
164172
const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
165173
const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
166174
const bool csv = get_arg(argv, argv + argc, "-csv");
175+
bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
167176

177+
#if NCCL_UB_SUPPORT == 0
178+
if (user_buffer_reg) {
179+
fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
180+
user_buffer_reg = false;
181+
}
182+
#endif //NCCL_UB_SUPPORT == 0
168183
int local_rank = -1;
169184
{
170185
MPI_Comm local_comm;
@@ -183,10 +198,25 @@ int main(int argc, char* argv[]) {
183198

184199
//TODO: Create a communicator (ncclComm_t), initialize it (ncclCommInitRank)
185200

201+
186202
real* a_ref_h;
187-
CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
188203
real* a_h;
204+
#if NCCL_UB_SUPPORT
205+
void* a_reg_handle;
206+
void* a_new_reg_handle;
207+
if (user_buffer_reg) {
208+
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
209+
210+
if ( nccl_version < 22304 ) {
211+
fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
212+
}
213+
}
214+
else
215+
#endif //NCCL_UB_SUPPORT
216+
{
217+
CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
189218
CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
219+
}
190220
double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank));
191221

192222
// ny - 2 rows are distributed amongst `size` ranks in such a way
@@ -386,10 +416,16 @@ int main(int argc, char* argv[]) {
386416

387417
CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
388418
CUDA_RT_CALL(cudaFree(l2_norm_d));
389-
419+
#if NCCL_UB_SUPPORT
420+
if (user_buffer_reg) {
421+
//TODO: Deregister and Free the Buffer
422+
}
423+
else
424+
#endif //NCCL_UB_SUPPORT
425+
{
390426
CUDA_RT_CALL(cudaFree(a_new));
391427
CUDA_RT_CALL(cudaFree(a));
392-
428+
}
393429
CUDA_RT_CALL(cudaFreeHost(a_h));
394430
CUDA_RT_CALL(cudaFreeHost(a_ref_h));
395431

0 commit comments

Comments
 (0)