Update Handson to use userbuffer

LenaO · LenaO · commit e1e28992f60c · 2024-11-17T17:53:00.000+01:00
diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md
@@ -21,6 +21,10 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU
 - Fix output message to indicate nccl rather than mpi
 - Destroy NCCL communicator
 
+If you have time left:
+- Use ncclMemAlloc to allocate the buffers and register them for communication
+- Don't forget to deregister and free the buffers correctly
+
 Compile with
 
 ``` {.bash}
diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile b/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile
@@ -1,5 +1,5 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
-THIS_TASK := 08H-NCCL-sol
+THIS_TASK := 08H-NCCL-task
 OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M')
 NP ?= 1
 NVCC=nvcc
@@ -42,5 +42,8 @@ sanitize: jacobi
 run: jacobi
 	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi
 
+run_user_buffer: jacobi
+	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi  -user_buffer_reg
+
 profile: jacobi
 	$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10
diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp
@@ -93,6 +93,12 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
 
 //TODO: include NCCL headers
 #include <nccl.h>
+#ifdef NCCL_VERSION
+#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
+#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
+#else
+#define NCCL_UB_SUPPORT 0
+#endif
 
 #define NCCL_CALL(call)                                                                     \
     {                                                                                       \
@@ -168,7 +174,13 @@ int main(int argc, char* argv[]) {
     const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
     const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
     const bool csv = get_arg(argv, argv + argc, "-csv");
-
+    bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
+#if NCCL_UB_SUPPORT == 0
+    if (user_buffer_reg) {
+        fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
+        user_buffer_reg = false;
+    }
+#endif //NCCL_UB_SUPPORT == 0
     int local_rank = -1;
     {
         MPI_Comm local_comm;
@@ -220,10 +232,27 @@ int main(int argc, char* argv[]) {
         chunk_size = chunk_size_high;
 
     real* a;
-    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
     real* a_new;
-    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
 
+#if NCCL_UB_SUPPORT
+    void* a_reg_handle;
+    void* a_new_reg_handle;
+    if (user_buffer_reg) {
+    //TODO: Allocate the memory with ncclMemAlloc and register it for the communicator
+        NCCL_CALL(ncclMemAlloc( (void**) &a    , nx * (chunk_size + 2) * sizeof(real)));
+        NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real)));
+        NCCL_CALL(ncclCommRegister(nccl_comm, a    , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle));
+        NCCL_CALL(ncclCommRegister(nccl_comm, a_new, nx * (chunk_size + 2) * sizeof(real), &a_new_reg_handle));
+        if ( nccl_version < 22304 ) {
+            fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
+        }
+    }
+    else
+#endif //NCCL_UB_SUPPORT
+    {
+    CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
+    CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
+    }
     CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real)));
     CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real)));
 
@@ -403,10 +432,20 @@ int main(int argc, char* argv[]) {
 
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));
-
+#if NCCL_UB_SUPPORT
+    if (user_buffer_reg) {
+//TODO: Deregister and Free the Buffer
+        NCCL_CALL(ncclCommDeregister(nccl_comm, a_new_reg_handle));
+        NCCL_CALL(ncclCommDeregister(nccl_comm, a_reg_handle));
+        NCCL_CALL(ncclMemFree(a_new));
+        NCCL_CALL(ncclMemFree(a));
+    }
+    else
+#endif //NCCL_UB_SUPPORT
+    {
     CUDA_RT_CALL(cudaFree(a_new));
     CUDA_RT_CALL(cudaFree(a));
-
+    }
     CUDA_RT_CALL(cudaFreeHost(a_h));
     CUDA_RT_CALL(cudaFreeHost(a_ref_h));
 
diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md
@@ -21,6 +21,10 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU
 - Fix output message to indicate nccl rather than mpi
 - Destroy NCCL communicator
 
+If you have time left:
+- Use ncclMemAlloc to allocate the buffers and register them for communication
+- Don't forget to deregister and free the buffers correctly
+
 Compile with
 
 ``` {.bash}
diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile b/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile
@@ -42,5 +42,8 @@ sanitize: jacobi
 run: jacobi
 	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi
 
+run_user_buffer: jacobi
+	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi  -user_buffer_reg
+
 profile: jacobi
 	$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10
diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp
@@ -31,6 +31,7 @@
 #include <sstream>
 
 #include <mpi.h>
+
 #define MPI_CALL(call)                                                                \
     {                                                                                 \
         int mpi_status = call;                                                        \
@@ -93,6 +94,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
 
 //TODO: include NCCL headers
 
+#ifdef NCCL_VERSION
+#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
+#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
+#else
+#define NCCL_UB_SUPPORT 0
+#endif
+
 #define NCCL_CALL(call)                                                                     \
     {                                                                                       \
         ncclResult_t  ncclStatus = call;                                                    \
@@ -164,7 +172,14 @@ int main(int argc, char* argv[]) {
     const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
     const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
     const bool csv = get_arg(argv, argv + argc, "-csv");
+    bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
 
+#if NCCL_UB_SUPPORT == 0
+    if (user_buffer_reg) {
+        fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
+        user_buffer_reg = false;
+    }
+#endif //NCCL_UB_SUPPORT == 0
     int local_rank = -1;
     {
         MPI_Comm local_comm;
@@ -183,10 +198,25 @@ int main(int argc, char* argv[]) {
 
     //TODO: Create a communicator (ncclComm_t), initialize it (ncclCommInitRank)
 
+
     real* a_ref_h;
-    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
     real* a_h;
+#if NCCL_UB_SUPPORT
+    void* a_reg_handle;
+    void* a_new_reg_handle;
+    if (user_buffer_reg) {
+    //TODO: Allocate the memory with ncclMemAlloc and register it for the communicator
+        
+        if ( nccl_version < 22304 ) {
+            fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
+        }
+    }
+    else
+#endif //NCCL_UB_SUPPORT
+    {
+    CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
     CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
+    }
     double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank));
 
     // ny - 2 rows are distributed amongst `size` ranks in such a way
@@ -386,10 +416,16 @@ int main(int argc, char* argv[]) {
 
     CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
     CUDA_RT_CALL(cudaFree(l2_norm_d));
-
+#if NCCL_UB_SUPPORT
+    if (user_buffer_reg) {
+//TODO: Deregister and Free the Buffer
+    }
+    else
+#endif //NCCL_UB_SUPPORT
+    {
     CUDA_RT_CALL(cudaFree(a_new));
     CUDA_RT_CALL(cudaFree(a));
-
+    }
     CUDA_RT_CALL(cudaFreeHost(a_h));
     CUDA_RT_CALL(cudaFreeHost(a_ref_h));