Skip to content

Commit 164f044

Browse files
rootroot
authored andcommitted
Edit .master ro have this correct in the futur
Leave out the ipynb file as this does not work
1 parent e1e2899 commit 164f044

File tree

6 files changed

+80
-27
lines changed

6 files changed

+80
-27
lines changed

08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU
2121
- Fix output message to indicate nccl rather than mpi
2222
- Destroy NCCL communicator
2323

24+
If you have time left:
25+
- Use ncclMemAlloc to allocate the buffers and register them for communication
26+
- Don`t forget to deregister and free the buffers correctly
27+
2428
Compile with
2529

2630
``` {.bash}

08-H_NCCL_NVSHMEM/.master/NCCL/Makefile.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,8 @@ sanitize: jacobi
4242
run: jacobi
4343
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi
4444

45+
run_user_buffer: jacobi
46+
$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg
47+
4548
profile: jacobi
4649
$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10

08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
9595
#ifdef SOLUTION
9696
#include <nccl.h>
9797
#endif
98+
#ifdef NCCL_VERSION
99+
#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
100+
#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
101+
#else
102+
#define NCCL_UB_SUPPORT 0
103+
#endif
104+
98105

99106
#define NCCL_CALL(call) \
100107
{ \
@@ -172,6 +179,13 @@ int main(int argc, char* argv[]) {
172179
const int nx = get_argval<int>(argv, argv + argc, "-nx", 16384);
173180
const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
174181
const bool csv = get_arg(argv, argv + argc, "-csv");
182+
bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
183+
#if NCCL_UB_SUPPORT == 0
184+
if (user_buffer_reg) {
185+
fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
186+
user_buffer_reg = false;
187+
}
188+
#endif //NCCL_UB_SUPPORT == 0
175189

176190
int local_rank = -1;
177191
{
@@ -226,10 +240,30 @@ int main(int argc, char* argv[]) {
226240
chunk_size = chunk_size_high;
227241

228242
real* a;
229-
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
230243
real* a_new;
231-
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
244+
#if NCCL_UB_SUPPORT
245+
void* a_reg_handle;
246+
void* a_new_reg_handle;
247+
if (user_buffer_reg) {
248+
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
249+
#ifdef SOLUTION
250+
251+
NCCL_CALL(ncclMemAlloc( (void**) &a , nx * (chunk_size + 2) * sizeof(real)));
252+
NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real)));
253+
NCCL_CALL(ncclCommRegister(nccl_comm, a , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle));
254+
NCCL_CALL(ncclCommRegister(nccl_comm, a_new, nx * (chunk_size + 2) * sizeof(real), &a_new_reg_handle));
255+
#endif
256+
if ( nccl_version < 22304 ) {
257+
fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
258+
}
259+
}
260+
else
261+
#endif //NCCL_UB_SUPPORT
232262

263+
{
264+
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
265+
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
266+
}
233267
CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real)));
234268
CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real)));
235269

@@ -434,10 +468,22 @@ int main(int argc, char* argv[]) {
434468

435469
CUDA_RT_CALL(cudaFreeHost(l2_norm_h));
436470
CUDA_RT_CALL(cudaFree(l2_norm_d));
437-
471+
#if NCCL_UB_SUPPORT
472+
if (user_buffer_reg) {
473+
//TODO: Deregister and Free the Buffer
474+
#ifdef SOLUTION
475+
NCCL_CALL(ncclCommDeregister(nccl_comm, a_new_reg_handle));
476+
NCCL_CALL(ncclCommDeregister(nccl_comm, a_reg_handle));
477+
NCCL_CALL(ncclMemFree(a_new));
478+
NCCL_CALL(ncclMemFree(a));
479+
#endif
480+
}
481+
else
482+
#endif //NCCL_UB_SUPPORT
483+
{
438484
CUDA_RT_CALL(cudaFree(a_new));
439485
CUDA_RT_CALL(cudaFree(a));
440-
486+
}
441487
CUDA_RT_CALL(cudaFreeHost(a_h));
442488
CUDA_RT_CALL(cudaFreeHost(a_ref_h));
443489

08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
2-
THIS_TASK := 08H-NCCL-task
2+
THIS_TASK := 08H-NCCL-sol
33
OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M')
44
NP ?= 1
55
NVCC=nvcc

08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
100100
#define NCCL_UB_SUPPORT 0
101101
#endif
102102

103+
103104
#define NCCL_CALL(call) \
104105
{ \
105106
ncclResult_t ncclStatus = call; \
@@ -181,6 +182,7 @@ int main(int argc, char* argv[]) {
181182
user_buffer_reg = false;
182183
}
183184
#endif //NCCL_UB_SUPPORT == 0
185+
184186
int local_rank = -1;
185187
{
186188
MPI_Comm local_comm;
@@ -233,12 +235,12 @@ int main(int argc, char* argv[]) {
233235

234236
real* a;
235237
real* a_new;
236-
237238
#if NCCL_UB_SUPPORT
238239
void* a_reg_handle;
239240
void* a_new_reg_handle;
240241
if (user_buffer_reg) {
241242
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
243+
242244
NCCL_CALL(ncclMemAlloc( (void**) &a , nx * (chunk_size + 2) * sizeof(real)));
243245
NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real)));
244246
NCCL_CALL(ncclCommRegister(nccl_comm, a , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle));
@@ -249,6 +251,7 @@ int main(int argc, char* argv[]) {
249251
}
250252
else
251253
#endif //NCCL_UB_SUPPORT
254+
252255
{
253256
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
254257
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));

08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
#include <sstream>
3232

3333
#include <mpi.h>
34-
3534
#define MPI_CALL(call) \
3635
{ \
3736
int mpi_status = call; \
@@ -93,14 +92,14 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t);
9392
}
9493

9594
//TODO: include NCCL headers
96-
9795
#ifdef NCCL_VERSION
9896
#define NCCL_VERSION_UB NCCL_VERSION(2,19,1)
9997
#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB
10098
#else
10199
#define NCCL_UB_SUPPORT 0
102100
#endif
103101

102+
104103
#define NCCL_CALL(call) \
105104
{ \
106105
ncclResult_t ncclStatus = call; \
@@ -173,13 +172,13 @@ int main(int argc, char* argv[]) {
173172
const int ny = get_argval<int>(argv, argv + argc, "-ny", 16384);
174173
const bool csv = get_arg(argv, argv + argc, "-csv");
175174
bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg");
176-
177175
#if NCCL_UB_SUPPORT == 0
178176
if (user_buffer_reg) {
179177
fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n");
180178
user_buffer_reg = false;
181179
}
182180
#endif //NCCL_UB_SUPPORT == 0
181+
183182
int local_rank = -1;
184183
{
185184
MPI_Comm local_comm;
@@ -198,25 +197,10 @@ int main(int argc, char* argv[]) {
198197

199198
//TODO: Create a communicator (ncclComm_t), initialize it (ncclCommInitRank)
200199

201-
202200
real* a_ref_h;
203-
real* a_h;
204-
#if NCCL_UB_SUPPORT
205-
void* a_reg_handle;
206-
void* a_new_reg_handle;
207-
if (user_buffer_reg) {
208-
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
209-
210-
if ( nccl_version < 22304 ) {
211-
fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
212-
}
213-
}
214-
else
215-
#endif //NCCL_UB_SUPPORT
216-
{
217201
CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real)));
202+
real* a_h;
218203
CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real)));
219-
}
220204
double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank));
221205

222206
// ny - 2 rows are distributed amongst `size` ranks in such a way
@@ -236,10 +220,23 @@ int main(int argc, char* argv[]) {
236220
chunk_size = chunk_size_high;
237221

238222
real* a;
239-
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
240223
real* a_new;
241-
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
224+
#if NCCL_UB_SUPPORT
225+
void* a_reg_handle;
226+
void* a_new_reg_handle;
227+
if (user_buffer_reg) {
228+
//TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior
229+
if ( nccl_version < 22304 ) {
230+
fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n");
231+
}
232+
}
233+
else
234+
#endif //NCCL_UB_SUPPORT
242235

236+
{
237+
CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real)));
238+
CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real)));
239+
}
243240
CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real)));
244241
CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real)));
245242

0 commit comments

Comments
 (0)