Skip to content

Commit f40ce73

Browse files
committed
NCCL 2.4.6-1
Added detection of IBM/Power NVLink bridge device.
Add NUMA support to PCI distance calculations.
Added NCCL_IGNORE_CPU_AFFINITY env var.
Fix memory leaks; GithubIssue#180
Compiler warning fix; GithubIssue#178
Replace non-standard variable length arrays. GithubIssue#171
Fix Tree+Shared Memory crash. GithubPR#185
Fix LL cleanup hang during long running DL jobs.
Fix NCCL_RINGS environment variable handling.
Added extra checks to catch repeat calls to ncclCommDestroy() GithubIssue#191
Improve bootstrap socket connection reliability at scale.
Fix hostname hashing issue. GithubIssue#187
Code cleanup to rename all non device files from *.cu to *.cc
1 parent 14e0cf6 commit f40ce73

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

81 files changed

+893
-693
lines changed

LICENSE.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33

44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
8989

9090
## Copyright
9191

92-
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
92+
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.

ext-net/dummy/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

ext-net/dummy/plugin.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

makefiles/common.mk

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#
@@ -15,6 +15,7 @@ PROFAPI ?= 0
1515
NVCC = $(CUDA_HOME)/bin/nvcc
1616

1717
CUDA_LIB ?= $(CUDA_HOME)/lib64
18+
CUDA_INC ?= $(CUDA_HOME)/include
1819
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
1920
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
2021
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -43,7 +44,8 @@ endif
4344
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
4445

4546
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
46-
CXXFLAGS += -Wall -Wno-sign-compare
47+
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
48+
CXXFLAGS += -I $(CUDA_INC)
4749
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
4850
# Use addprefix so that we can specify more than one path
4951
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
6769
endif
6870

6971
ifneq ($(VERBOSE), 0)
70-
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
72+
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
7173
CXXFLAGS += -Wall -Wextra
7274
else
7375
.SILENT:

makefiles/formatting.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

makefiles/version.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
##### version
22
NCCL_MAJOR := 2
33
NCCL_MINOR := 4
4-
NCCL_PATCH := 2
4+
NCCL_PATCH := 6
55
NCCL_SUFFIX :=
66
PKG_REVISION := 1

pkg/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

pkg/debian/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

pkg/redhat/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

pkg/srctxz/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

pkg/srctxz/create_srctxz.sh.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
#
3-
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3+
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
44
#
55
# See LICENSE.txt for license information
66
#

pkg/txz/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

pkg/txz/create_txz.sh.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
#
3-
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3+
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
44
#
55
# See LICENSE.txt for license information
66
#

src/Makefile

+10-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
99

1010
##### src files
1111
INCEXPORTS := nccl.h nccl_net.h
12-
LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
13-
misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
14-
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
15-
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
12+
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
13+
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
14+
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
15+
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
1616

1717
##### lib files
1818
LIBNAME := libnccl.so
@@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
2727
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
2828
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
2929
STATICLIBTARGET := $(STATICLIBNAME)
30-
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
30+
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
3131
DEPFILES := $(LIBOBJ:%.o=%.d)
3232
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
3333

@@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
8787
mkdir -p $(INCDIR)
8888
cp -f $< $@
8989

90-
$(OBJDIR)/%.o : %.cu
90+
$(OBJDIR)/%.o : %.cc
9191
@printf "Compiling %-35s > %s\n" $< $@
9292
mkdir -p `dirname $@`
93-
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
94-
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
93+
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
94+
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
9595
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
9696
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
9797
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@@ -107,7 +107,7 @@ install : lib
107107
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
108108
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
109109

110-
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
110+
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
111111
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
112112
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
113113
# as the BUILDDIR variable.

src/bootstrap.cu src/bootstrap.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/channel.cu src/channel.cc

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
4747
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
4848
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
4949
}
50+
51+
// Free the peer structures.
52+
CUDACHECK(cudaFree(channel->devPeers));
53+
free(channel->peers);
54+
5055
return ncclSuccess;
5156
}

src/collectives/all_gather.cu src/collectives/all_gather.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/all_reduce.cu src/collectives/all_reduce.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/broadcast.cu src/collectives/broadcast.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/collectives.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/device/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
#
44
# See LICENSE.txt for license information
55
#

src/collectives/device/all_gather.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/device/all_gather.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/
66

7-
#include "core.h"
7+
#include "devcomm.h"
88
#include "primitives.h"
99
#include "collectives.h"
1010

@@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
1313
const int tid = threadIdx.x;
1414
const int nthreads = blockDim.x - 1;
1515
const int bid = args->bid;
16-
struct ncclComm* comm = args->comm;
16+
struct ncclDevComm* comm = args->comm;
1717
struct ncclChannel* channel = comm->channels+blockIdx.x;
1818
struct ncclRing* ring = &channel->ring;
1919
const ssize_t size = args->N;
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
7474
const int tid = threadIdx.x;
7575
const int bid = args->bid;
7676
const int nthreads = args->nThreads;
77-
struct ncclComm* comm = args->comm;
77+
struct ncclDevComm* comm = args->comm;
7878
struct ncclChannel* channel = comm->channels+blockIdx.x;
7979
struct ncclRing* ring = &channel->ring;
8080

src/collectives/device/all_reduce.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/device/all_reduce.h

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/
66

7-
#include "core.h"
7+
#include "devcomm.h"
88
#include "primitives.h"
99
#include "collectives.h"
1010

@@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
1313
const int tid = threadIdx.x;
1414
const int nthreads = blockDim.x - 1;
1515
const int bid = args->bid;
16-
struct ncclComm* comm = args->comm;
16+
struct ncclDevComm* comm = args->comm;
1717
struct ncclChannel* channel = comm->channels+blockIdx.x;
1818
struct ncclRing* ring = &channel->ring;
1919
const ssize_t size = args->N;
@@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
8787
const int tid = threadIdx.x;
8888
const int nthreads = blockDim.x - 1;
8989
const int bid = args->bid;
90-
struct ncclComm* comm = args->comm;
90+
struct ncclDevComm* comm = args->comm;
9191
struct ncclChannel* channel = comm->channels+blockIdx.x;
9292
struct ncclTree* tree = &channel->tree;
9393
const ssize_t size = args->N;
@@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
139139
const int tid = threadIdx.x;
140140
const int bid = args->bid;
141141
const int nthreads = args->nThreads;
142-
struct ncclComm* comm = args->comm;
142+
struct ncclDevComm* comm = args->comm;
143143
struct ncclChannel* channel = comm->channels+blockIdx.x;
144144
struct ncclRing* ring = &channel->ring;
145145

@@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
214214
const int tid = threadIdx.x;
215215
const int nthreads = args->nThreads;
216216
const int bid = args->bid;
217-
struct ncclComm* comm = args->comm;
217+
struct ncclDevComm* comm = args->comm;
218218
struct ncclChannel* channel = comm->channels+blockIdx.x;
219219
struct ncclTree* tree = &channel->tree;
220220
const ssize_t size = args->N;

src/collectives/device/broadcast.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/

src/collectives/device/broadcast.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
/*************************************************************************
2-
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
33
*
44
* See LICENSE.txt for license information
55
************************************************************************/
66

7-
#include "core.h"
7+
#include "devcomm.h"
88
#include "primitives.h"
99
#include "collectives.h"
1010

@@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
1313
const int tid = threadIdx.x;
1414
const int nthreads = blockDim.x - 1;
1515
const int bid = args->bid;
16-
struct ncclComm* comm = args->comm;
16+
struct ncclDevComm* comm = args->comm;
1717
struct ncclChannel* channel = comm->channels+blockIdx.x;
1818
struct ncclRing* ring = &channel->ring;
1919
const ssize_t size = args->N;
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
5959
const int tid = threadIdx.x;
6060
const int bid = args->bid;
6161
const int nthreads = args->nThreads;
62-
struct ncclComm* comm = args->comm;
62+
struct ncclDevComm* comm = args->comm;
6363
struct ncclChannel* channel = comm->channels+blockIdx.x;
6464
struct ncclRing* ring = &channel->ring;
6565

0 commit comments

Comments
 (0)