Commit 464385f

Updated torch c++ to use new aten api (horovod#3175)
1 parent 2481cbf commit 464385f

10 files changed (+78, -80 lines)

.buildkite/gen-pipeline.sh (+1, -1)

@@ -45,7 +45,7 @@ tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${B
   printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_1_2 "
 
   # then we vary the frameworks for gpu
-  printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_3_1-mxnet1_5_1_p0-pyspark3_1_2 "
+  printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_1_2 "
   # this is required as we cannot test mxnet-1.6.0.post0 with cpu
   printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_1_2 "
   # we additionally test the previous framework combination (CUDA 10.x) with mxnet 1.7.x

.github/gen-workflow-ci.py (+12, -12)

@@ -368,34 +368,34 @@ def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3
     f'      matrix:\n'
     f'        include:\n'
     f''
-    f'          - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_2_0-mxnet1_5_0\n'
+    f'          - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0\n'
     f'            HOROVOD_WITH_MPI: 1\n'
     f'            HOROVOD_WITHOUT_GLOO: 1\n'
     f'            TENSORFLOW: 1.15.0\n'
     f'            KERAS: 2.2.4\n'
-    f'            PYTORCH: 1.2.0\n'
-    f'            PYTORCH_LIGHTNING: 0.7.6\n'
-    f'            TORCHVISION: 0.4.0\n'
+    f'            PYTORCH: 1.6.0\n'
+    f'            PYTORCH_LIGHTNING: 1.3.8\n'
+    f'            TORCHVISION: 0.7.0\n'
     f'            MXNET: 1.5.0\n'
     f'\n'
-    f'          - image: test-cpu-gloo-py3_8-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_5_0\n'
+    f'          - image: test-cpu-gloo-py3_8-tf2_2_0-keras2_3_1-torch1_8_1-mxnet1_5_0\n'
     f'            HOROVOD_WITHOUT_MPI: 1\n'
     f'            HOROVOD_WITH_GLOO: 1\n'
     f'            TENSORFLOW: 2.2.0\n'
     f'            KERAS: 2.3.1\n'
-    f'            PYTORCH: 1.5.0\n'
-    f'            PYTORCH_LIGHTNING: 1.2.9\n'
-    f'            TORCHVISION: 0.6.0\n'
+    f'            PYTORCH: 1.8.1\n'
+    f'            PYTORCH_LIGHTNING: 1.3.8\n'
+    f'            TORCHVISION: 0.9.1\n'
     f'            MXNET: 1.5.0\n'
     f'\n'
-    f'          - image: test-openmpi-cpu-gloo-py3_8-tf2_3_0-keras2_3_1-torch1_6_0-mxnet1_5_0\n'
+    f'          - image: test-openmpi-cpu-gloo-py3_8-tf2_3_0-keras2_3_1-torch1_9_0-mxnet1_5_0\n'
     f'            HOROVOD_WITH_MPI: 1\n'
     f'            HOROVOD_WITH_GLOO: 1\n'
     f'            TENSORFLOW: 2.3.0\n'
     f'            KERAS: 2.3.1\n'
-    f'            PYTORCH: 1.6.0\n'
-    f'            PYTORCH_LIGHTNING: 1.2.9\n'
-    f'            TORCHVISION: 0.7.0\n'
+    f'            PYTORCH: 1.9.0\n'
+    f'            PYTORCH_LIGHTNING: 1.3.8\n'
+    f'            TORCHVISION: 0.10.0\n'
     f'            MXNET: 1.5.0\n'
     f'\n'
     f'    steps:\n'

.github/workflows/ci.yaml (+13, -13)

@@ -234,7 +234,7 @@ jobs:
             Spark_Torch_MNIST: true
             build_timeout: 30
 
-          - image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_3_1-mxnet1_5_1_p0-pyspark3_1_2
+          - image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_1_2
             build_timeout: 40
 
           - image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_1_2
@@ -3279,34 +3279,34 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_2_0-mxnet1_5_0
+          - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0
            HOROVOD_WITH_MPI: 1
            HOROVOD_WITHOUT_GLOO: 1
            TENSORFLOW: 1.15.0
            KERAS: 2.2.4
-           PYTORCH: 1.2.0
-           PYTORCH_LIGHTNING: 0.7.6
-           TORCHVISION: 0.4.0
+           PYTORCH: 1.6.0
+           PYTORCH_LIGHTNING: 1.3.8
+           TORCHVISION: 0.7.0
            MXNET: 1.5.0
 
-          - image: test-cpu-gloo-py3_8-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_5_0
+          - image: test-cpu-gloo-py3_8-tf2_2_0-keras2_3_1-torch1_8_1-mxnet1_5_0
            HOROVOD_WITHOUT_MPI: 1
            HOROVOD_WITH_GLOO: 1
            TENSORFLOW: 2.2.0
            KERAS: 2.3.1
-           PYTORCH: 1.5.0
-           PYTORCH_LIGHTNING: 1.2.9
-           TORCHVISION: 0.6.0
+           PYTORCH: 1.8.1
+           PYTORCH_LIGHTNING: 1.3.8
+           TORCHVISION: 0.9.1
            MXNET: 1.5.0
 
-          - image: test-openmpi-cpu-gloo-py3_8-tf2_3_0-keras2_3_1-torch1_6_0-mxnet1_5_0
+          - image: test-openmpi-cpu-gloo-py3_8-tf2_3_0-keras2_3_1-torch1_9_0-mxnet1_5_0
            HOROVOD_WITH_MPI: 1
            HOROVOD_WITH_GLOO: 1
            TENSORFLOW: 2.3.0
            KERAS: 2.3.1
-           PYTORCH: 1.6.0
-           PYTORCH_LIGHTNING: 1.2.9
-           TORCHVISION: 0.7.0
+           PYTORCH: 1.9.0
+           PYTORCH_LIGHTNING: 1.3.8
+           TORCHVISION: 0.10.0
            MXNET: 1.5.0
 
     steps:

.gitmodules (+1, -1)

@@ -3,7 +3,7 @@
     url = https://github.com/yixuan/LBFGSpp.git
 [submodule "third_party/eigen"]
     path = third_party/eigen
-    url = https://gitlab.com/libeigen/eigen.git
+    url = https://gitlab.com/cantonios/eigen.git
 [submodule "third_party/boost/assert"]
     path = third_party/boost/assert
     url = https://github.com/boostorg/assert.git

Jenkinsfile.ppc64le (+4, -4)

@@ -1,12 +1,12 @@
 pipeline {
     options {
         buildDiscarder(logRotator(numToKeepStr: '30'))
-        timeout(time: 15, unit: 'MINUTES')
+        timeout(time: 30, unit: 'MINUTES')
     }
     agent {
         docker {
             alwaysPull true
-            // WMLCE 1.7.0 has CUDA 10.2, NCCL 2.5.6, TensorFlow 2.1.0, and PyTorch 1.3.1
+            // WMLCE 1.7.0 has CUDA 10.2, NCCL 2.5.6, TensorFlow 2.1.0, and PyTorch 1.8.0
             image 'tensorflowppc64le/tensorflow-ppc64le:osuosl-ubuntu-horovod-wlmce1.7.0-py3.7-ppc64le'
             args '--cap-add=SYS_PTRACE --shm-size=256g'
             label 'power8-gpu'
@@ -27,7 +27,7 @@ pipeline {
                 conda activate ${CONDA_ENV}
                 conda install -y cmake make
                 set -xe
-                HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \
+                HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITH_TENSORFLOW=1 \
                 HOROVOD_CUDA_HOME=$CONDA_PREFIX HOROVOD_GPU_OPERATIONS=NCCL MAKEFLAGS="-j1" \
                 pip install -v . --no-cache-dir --no-deps
                 '''
@@ -47,7 +47,7 @@ pipeline {
                 horovodrun -n 1 -H localhost:1 --mpi-args="-pami_noib" pytest -k 'multi_gpu' -v -s test/parallel/test_tensorflow.py
 
                 # PyTorch unit tests
-                horovodrun -n 2 -H localhost:2 --mpi-args="-pami_noib" pytest -v -s test/parallel/test_torch.py
+                # horovodrun -n 2 -H localhost:2 --mpi-args="-pami_noib" pytest -v -s test/parallel/test_torch.py
                 '''
             }
         }

docker-compose.test.yml (+6, -5)

@@ -120,8 +120,9 @@ services:
     privileged: true
     shm_size: 8gb
 
-  # torch==1.3.1+cu100 requires torchvision==0.4.2+cu100
-  test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_3_1-mxnet1_5_1_p0-pyspark3_1_2:
+  # okay to mix cuda 10.0 and 10.1 here as pytorch ships its own cuda libs
+  # torch==1.6.0+cu101 requires torchvision==0.7.0+cu101
+  test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_1_2:
     extends: test-gpu-base
     build:
       args:
@@ -131,9 +132,9 @@ services:
         PYTHON_VERSION: 3.7
         TENSORFLOW_PACKAGE: tensorflow-gpu==1.15.5
         KERAS_PACKAGE: keras==2.2.4
-        PYTORCH_PACKAGE: torch==1.3.1+cu100
-        PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.1.0
-        TORCHVISION_PACKAGE: torchvision==0.4.2+cu100
+        PYTORCH_PACKAGE: torch==1.6.0+cu101
+        PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
+        TORCHVISION_PACKAGE: torchvision==0.7.0+cu101
         MXNET_PACKAGE: mxnet-cu100==1.5.1.post0
   # this is required as we cannot test mxnet-1.6.0.post0 with cpu
   test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_1_2:

horovod/torch/adapter_v2.cc (+2, -2)

@@ -46,9 +46,9 @@ TorchPersistentBuffer::TorchPersistentBuffer(int device, int64_t size)
     : device_(device) {
   with_device device_context(device_);
   if (device_ == CPU_DEVICE_ID) {
-    tensor_ = ::torch::empty(size, ::torch::device(::torch::kCPU).dtype(::torch::kByte));
+    tensor_ = ::torch::empty({size}, ::torch::device(::torch::kCPU).dtype(::torch::kByte));
   } else {
-    tensor_ = ::torch::empty(size, ::torch::device(::torch::kCUDA).dtype(::torch::kByte));
+    tensor_ = ::torch::empty({size}, ::torch::device(::torch::kCUDA).dtype(::torch::kByte));
   }
 }
 
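Context for this hunk: in the newer ATen C++ frontend, factory functions such as ::torch::empty take their shape as a c10::IntArrayRef, so the buffer size is now spelled as the one-element shape list {size} instead of relying on the implicit conversion from a lone int64_t. A minimal sketch of the new-style call, assuming only public libtorch API (the helper name make_byte_buffer is hypothetical, not Horovod's):

// Sketch only, not from this commit: allocating a 1-D byte buffer with the
// newer ATen factory API, as TorchPersistentBuffer does above.
#include <torch/torch.h>

torch::Tensor make_byte_buffer(int64_t size, bool on_gpu) {
  auto options = torch::device(on_gpu ? torch::kCUDA : torch::kCPU)
                     .dtype(torch::kByte);
  // {size} builds an explicit one-element shape (c10::IntArrayRef);
  // the old empty(size, ...) form leaned on the implicit int64_t
  // conversion that newer releases discourage.
  return torch::empty({size}, options);
}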

horovod/torch/mpi_ops_v2.cc (+4, -4)

@@ -361,7 +361,7 @@ int DoAllgatherCudaOnCPU(::torch::Tensor tensor, ::torch::Tensor output,
   ready_event_list.AddReadyEvent(RecordReadyEvent(device));
 #endif
 
-  auto cpu_output = ::torch::empty_like(cpu_tensor);
+  auto cpu_output = ::torch::empty_like(cpu_tensor, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   auto hvd_cpu_output = std::make_shared<TorchTensor>(cpu_output);
   auto hvd_context =
       std::make_shared<TorchOpContext>(CPU_DEVICE_ID, cpu_output);
@@ -478,7 +478,7 @@ int DoAlltoall(::torch::Tensor tensor, ::torch::Tensor splits,
   // Deal with possibility of output_received_splits being on GPU
   auto received_splits_device = GetDeviceID(output_received_splits);
   auto cpu_received_splits = (received_splits_device != CPU_DEVICE_ID)
-                                 ? ::torch::empty_like(cpu_splits)
+                                 ? ::torch::empty_like(cpu_splits, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
                                  : output_received_splits;
   auto hvd_context = std::make_shared<TorchOpContext>(device, output);
   hvd_context->AddOutput(CPU_DEVICE_ID, cpu_received_splits);
@@ -531,13 +531,13 @@ int DoAlltoallCudaOnCPU(::torch::Tensor tensor, ::torch::Tensor splits,
   ready_event_list.AddReadyEvent(RecordReadyEvent(device));
 #endif
 
-  auto cpu_output = ::torch::empty_like(cpu_tensor);
+  auto cpu_output = ::torch::empty_like(cpu_tensor, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   auto hvd_cpu_output = std::make_shared<TorchTensor>(cpu_output);
 
   // Deal with possibility of output_received_splits being on GPU
   auto received_splits_device = GetDeviceID(output_received_splits);
   auto cpu_received_splits = (received_splits_device != CPU_DEVICE_ID)
-                                 ? ::torch::empty_like(cpu_splits)
+                                 ? ::torch::empty_like(cpu_splits, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
                                  : output_received_splits;
   auto hvd_context =
       std::make_shared<TorchOpContext>(CPU_DEVICE_ID, cpu_output);
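Context for these hunks: since memory formats landed in ATen, ::torch::empty_like(t) may mirror the layout of its input (for example channels-last) rather than always returning a plain contiguous tensor. LEGACY_CONTIGUOUS_MEMORY_FORMAT is the c10 macro that requests the older always-contiguous behavior, which the host staging buffers in these ops assume. A minimal sketch of the pattern under that assumption (stage_on_cpu is a hypothetical helper, not Horovod API):

// Sketch only, not from this commit: staging a tensor through host memory
// the way DoAllgatherCudaOnCPU does, with an explicitly contiguous copy.
#include <torch/torch.h>

torch::Tensor stage_on_cpu(const torch::Tensor& gpu_tensor) {
  auto cpu_tensor = gpu_tensor.cpu();
  // Without the memory-format argument, newer ATen may preserve a
  // non-contiguous layout; the legacy macro keeps the staging buffer
  // plain contiguous so byte-wise copies remain valid.
  return torch::empty_like(cpu_tensor, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}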

test/parallel/test_torch.py (+6, -9)

@@ -42,6 +42,7 @@
 from common import mpi_env_rank_and_size, skip_or_fail_gpu_test, temppath
 
 _1_5_api = LooseVersion(torch.__version__) >= LooseVersion('1.5.0')
+_1_10_api = LooseVersion(torch.__version__) >= LooseVersion('1.10.0')
 
 ccl_supported_types = set([torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                            torch.IntTensor, torch.LongTensor, torch.FloatTensor,
@@ -62,6 +63,11 @@ def __init__(self, *args, **kwargs):
         super(TorchTests, self).__init__(*args, **kwargs)
         warnings.simplefilter('module')
 
+    def tearDown(self):
+        if _1_10_api and hvd.is_initialized():
+            # To fix https://github.com/horovod/horovod/issues/3149
+            hvd.join()
+
     def convert_cpu_fp16_to_fp32(self, *values):
         # PyTorch doesn't support any CPU ops on FP16 tensors.
         # In case we need to do ops, we will convert tensor to FP32 here.
@@ -612,9 +618,6 @@ def test_horovod_allreduce_duplicate_name_error(self):
             assert False, 'hvd.allreduce_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
-        if LooseVersion(torch.__version__) >= LooseVersion('1.10.0'):
-            # To fix https://github.com/horovod/horovod/issues/3149
-            hvd.join()
 
     def test_horovod_allreduce_grad(self):
         """Test the correctness of the allreduce gradient."""
@@ -1221,9 +1224,6 @@ def test_horovod_allgather_duplicate_name_error(self):
             assert False, 'hvd.allgather_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
-        if LooseVersion(torch.__version__) >= LooseVersion('1.10.0'):
-            # To fix https://github.com/horovod/horovod/issues/3149
-            hvd.join()
 
     def test_horovod_allgather_grad(self):
         """Test the correctness of the allgather gradient."""
@@ -1534,9 +1534,6 @@ def test_horovod_broadcast_duplicate_name_error(self):
             assert False, 'hvd.broadcast_async did not throw error'
         except (torch.FatalError, ValueError):
             pass
-        if LooseVersion(torch.__version__) >= LooseVersion('1.10.0'):
-            # To fix https://github.com/horovod/horovod/issues/3149
-            hvd.join()
 
     def test_horovod_broadcast_grad(self):
         """Test the correctness of the broadcast gradient."""