Add an option to pin to gpu for all estimators (horovod#3526)

TJ Xu · web-flow · commit 3926a01429df · 2022-04-30T08:58:47.000-07:00
* Add an option to pin to gpu for all estimators

* Fix CI by downloading nv keys directly
diff --git a/Dockerfile.test.gpu b/Dockerfile.test.gpu
@@ -29,6 +29,13 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
+# Extract ubuntu distribution version and download the corresponding key.
+# This is to fix CI failures caused by the new rotating key mechanism rolled out by Nvidia.
+# Refer to https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771 for more details.
+RUN DIST=$(echo ${CUDA_DOCKER_VERSION#*ubuntu} | sed 's/\.//'); \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${DIST}/x86_64/3bf863cc.pub && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu${DIST}/x86_64/7fa2af80.pub
+
 # Prepare to install specific g++ versions
 RUN apt-get update -qq && apt-get install -y --no-install-recommends software-properties-common
 RUN add-apt-repository ppa:ubuntu-toolchain-r/test
diff --git a/docker/horovod-ray/Dockerfile b/docker/horovod-ray/Dockerfile
@@ -16,6 +16,13 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
+# Download the corresponding key for ubuntu1804.
+# This is to fix CI failures caused by the new rotating key mechanism rolled out by Nvidia.
+# Refer to https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771 for more details.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
 RUN sudo apt-get update && DEBIAN_FRONTEND="noninteractive" sudo apt-get install -y \
         build-essential \
         cmake \
diff --git a/docker/horovod/Dockerfile b/docker/horovod/Dockerfile
@@ -24,6 +24,13 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
+# Extract ubuntu distribution version and download the corresponding key.
+# This is to fix CI failures caused by the new rotating key mechanism rolled out by Nvidia.
+# Refer to https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771 for more details.
+RUN DIST=$(echo ${CUDA_DOCKER_VERSION#*ubuntu} | sed 's/\.//'); \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${DIST}/x86_64/3bf863cc.pub && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu${DIST}/x86_64/7fa2af80.pub
+
 RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
         build-essential \
         cmake \
diff --git a/horovod/spark/common/params.py b/horovod/spark/common/params.py
@@ -96,6 +96,16 @@ class EstimatorParams(Params):
 
     label_shapes = Param(Params._dummy(), 'label_shapes', 'specifies the shape (or shapes) of the label column (or columns)')
 
+    inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
+                               'Cache the data in memory for training and validation.',
+                               typeConverter=TypeConverters.toBoolean)
+
+    use_gpu = Param(Params._dummy(), 'use_gpu',
+                    'Whether to use the GPU for training. '
+                    'Setting this to False will skip binding to GPU even when GPU is available. '
+                    'Defaults to True.',
+                    typeConverter=TypeConverters.toBoolean)
+
     def __init__(self):
         super(EstimatorParams, self).__init__()
 
@@ -129,7 +139,9 @@ def __init__(self):
             train_reader_num_workers=2,
             val_reader_num_workers=2,
             reader_pool_type='process',
-            label_shapes=None)
+            label_shapes=None,
+            inmemory_cache_all=False,
+            use_gpu=True)
 
     def _check_params(self, metadata):
         model = self.getModel()
@@ -334,6 +346,17 @@ def setLabelShapes(self, value):
     def getLabelShapes(self):
         return self.getOrDefault(self.label_shapes)
 
+    def setInMemoryCacheAll(self, value):
+        return self._set(inmemory_cache_all=value)
+
+    def getInMemoryCacheAll(self):
+        return self.getOrDefault(self.inmemory_cache_all)
+
+    def setUseGpu(self, value):
+        return self._set(use_gpu=value)  # return the _set() result so the setter chains like setInMemoryCacheAll
+
+    def getUseGpu(self):
+        return self.getOrDefault(self.use_gpu)
 
 class ModelParams(HasOutputCols):
     history = Param(Params._dummy(), 'history', 'history')
diff --git a/horovod/spark/keras/estimator.py b/horovod/spark/keras/estimator.py
@@ -147,14 +147,12 @@ class KerasEstimator(HorovodEstimator, KerasEstimatorParamsReadable,
         inmemory_cache_all: boolean value. Cache the data in memory for training and validation. Default: False.
         backend_env: dict to add to the environment of the backend.  Defaults to setting the java heap size to
                      2G min and max for libhdfs through petastorm
+        use_gpu: Whether to use the GPU for training. Defaults to True.
     """
 
     custom_objects = Param(Params._dummy(), 'custom_objects', 'custom objects')
     checkpoint_callback = Param(Params._dummy(), 'checkpoint_callback',
                                 'model checkpointing callback')
-    inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
-                               'Cache the data in memory for training and validation.',
-                               typeConverter=TypeConverters.toBoolean)
     backend_env = Param(Params._dummy(), "backend_env",
                         "dict to add to the environment of the command run on the environment")
 
@@ -192,14 +190,14 @@ def __init__(self,
                  label_shapes=None,
                  checkpoint_callback=None,
                  inmemory_cache_all=False,
-                 backend_env=None):
+                 backend_env=None,
+                 use_gpu=True):
 
         super(KerasEstimator, self).__init__()
 
         self._setDefault(optimizer=None,
                          custom_objects={},
                          checkpoint_callback=None,
-                         inmemory_cache_all=False,
                          backend_env={'LIBHDFS_OPTS': '-Xms2048m -Xmx2048m'})
 
         kwargs = self._input_kwargs
@@ -235,12 +233,6 @@ def setCheckpointCallback(self, value):
     def getCheckpointCallback(self):
         return self.getOrDefault(self.checkpoint_callback)
 
-    def setInMemoryCacheAll(self, value):
-        return self._set(inmemory_cache_all=value)
-
-    def getInMemoryCacheAll(self):
-        return self.getOrDefault(self.inmemory_cache_all)
-
     def setBackendEnv(self, value):
         self._set(backend_env=value)
 
diff --git a/horovod/spark/keras/remote.py b/horovod/spark/keras/remote.py
@@ -52,6 +52,7 @@ def RemoteTrainer(estimator, metadata, keras_utils, run_id, dataset_idx):
     user_verbose = estimator.getVerbose()
     checkpoint_callback = estimator.getCheckpointCallback()
     inmemory_cache_all = estimator.getInMemoryCacheAll()
+    should_use_gpu = estimator.getUseGpu()
 
     # Data reader parameters
     train_reader_worker_count = estimator.getTrainReaderNumWorker()
@@ -111,7 +112,16 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
         hvd = get_horovod()
         hvd.init()
 
-        pin_gpu(hvd, tf, k)
+        # Verbose mode 1 will print a progress bar
+        verbose = user_verbose if hvd.rank() == 0 else 0
+
+        if should_use_gpu:
+            if verbose:
+                print("Pinning current process to the GPU.")
+            pin_gpu(hvd, tf, k)
+        else:
+            if verbose:
+                print("Skip pinning current process to the GPU.")
 
         if random_seed is not None:
             if LooseVersion(tf.__version__) < LooseVersion('2.0.0'):
@@ -137,8 +147,6 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
         scaled_lr = k.backend.get_value(model.optimizer.lr) * hvd.size()
         k.backend.set_value(model.optimizer.lr, scaled_lr)
 
-        # Verbose mode 1 will print a progress bar
-        verbose = user_verbose if hvd.rank() == 0 else 0
 
         if verbose:
             print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
diff --git a/horovod/spark/lightning/estimator.py b/horovod/spark/lightning/estimator.py
@@ -181,6 +181,7 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
         debug_data_loader: (Optional)Debugging flag for data loader.
         train_async_data_loader_queue_size: (Optional) Size of train async data loader queue.
         val_async_data_loader_queue_size: (Optional) Size of val async data loader queue.
+        use_gpu: Whether to use the GPU for training. Defaults to True.
     """
 
     input_shapes = Param(Params._dummy(), 'input_shapes', 'input layer shapes')
@@ -189,10 +190,6 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
     train_minibatch_fn = Param(Params._dummy(), 'train_minibatch_fn',
                                'functions that construct the minibatch train function for torch')
 
-    inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
-                               'Cache the data in memory for training and validation.',
-                               typeConverter=TypeConverters.toBoolean)
-
     num_gpus = Param(Params._dummy(), 'num_gpus',
                      'Number of gpus per process, default to 1 when CUDA is available in the backend, otherwise 0.')
 
@@ -266,14 +263,14 @@ def __init__(self,
                  profiler=None,
                  debug_data_loader=False,
                  train_async_data_loader_queue_size=None,
-                 val_async_data_loader_queue_size=None):
+                 val_async_data_loader_queue_size=None,
+                 use_gpu=True):
 
         super(TorchEstimator, self).__init__()
         self._setDefault(loss_constructors=None,
                          input_shapes=None,
                          train_minibatch_fn=None,
                          transformation_fn=None,
-                         inmemory_cache_all=False,
                          num_gpus=None,
                          logger=None,
                          log_every_n_steps=50,
@@ -315,12 +312,6 @@ def setLossConstructors(self, value):
     def getLossConstructors(self):
         return self.getOrDefault(self.loss_constructors)
 
-    def setInMemoryCacheAll(self, value):
-        return self._set(inmemory_cache_all=value)
-
-    def getInMemoryCacheAll(self):
-        return self.getOrDefault(self.inmemory_cache_all)
-
     def setNumGPUs(self, value):
         return self._set(num_gpus=value)
 
diff --git a/horovod/spark/lightning/remote.py b/horovod/spark/lightning/remote.py
@@ -64,6 +64,7 @@ def RemoteTrainer(estimator, metadata, ckpt_bytes, run_id, dataset_idx, train_ro
     debug_data_loader = estimator.getDebugDataLoader()
     train_async_data_loader_queue_size = estimator.getTrainAsyncDataLoaderQueueSize()
     val_async_data_loader_queue_size = estimator.getValAsyncDataLoaderQueueSize()
+    should_use_gpu = estimator.getUseGpu()
 
     # get logger
     logger = estimator.getLogger()
@@ -194,7 +195,16 @@ def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
                       f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n"
                       f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n")
 
+            if not should_use_gpu and verbose:
+                print("Skip pinning current process to the GPU.")
+
             cuda_available = torch.cuda.is_available()
+
+            if cuda_available and not should_use_gpu:
+                print("GPU is available but use_gpu is set to False. "
+                      "Training will proceed without GPU support.")
+                cuda_available = False  # report this rank as CPU-only in the device-type check that follows
+
             # We need to check all ranks have same device type for traning.
             # Horovod doesn't support heterogeneous allreduce for gradients.
             cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
diff --git a/horovod/spark/torch/estimator.py b/horovod/spark/torch/estimator.py
@@ -147,6 +147,8 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
         val_reader_num_workers: Similar to the train_reader_num_workers.
         reader_pool_type: Type of worker pool used to parallelize reading data from the dataset.
                           Should be one of ['thread', 'process']. Defaults to 'process'.
+        inmemory_cache_all: (Optional) Cache the data in memory for training and validation.
+        use_gpu: Whether to use the GPU for training. Defaults to True.
     """
 
     input_shapes = Param(Params._dummy(), 'input_shapes', 'input layer shapes')
@@ -155,10 +157,6 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
     train_minibatch_fn = Param(Params._dummy(), 'train_minibatch_fn',
                                'functions that construct the minibatch train function for torch')
 
-    inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
-                               'Cache the data in memory for training and validation.',
-                               typeConverter=TypeConverters.toBoolean)
-
     @keyword_only
     def __init__(self,
                  num_proc=None,
@@ -193,14 +191,14 @@ def __init__(self,
                  val_reader_num_workers=None,
                  reader_pool_type=None,
                  label_shapes=None,
-                 inmemory_cache_all=False):
+                 inmemory_cache_all=False,
+                 use_gpu=True):
 
         super(TorchEstimator, self).__init__()
         self._setDefault(loss_constructors=None,
                          input_shapes=None,
                          train_minibatch_fn=None,
-                         transformation_fn=None,
-                         inmemory_cache_all=False)
+                         transformation_fn=None)
 
         kwargs = self._input_kwargs
 
@@ -227,12 +225,6 @@ def setLossConstructors(self, value):
     def getLossConstructors(self):
         return self.getOrDefault(self.loss_constructors)
 
-    def setInMemoryCacheAll(self, value):
-        return self._set(inmemory_cache_all=value)
-
-    def getInMemoryCacheAll(self):
-        return self.getOrDefault(self.inmemory_cache_all)
-
     def _get_optimizer(self):
         return self.getOrDefault(self.optimizer)
 
diff --git a/horovod/spark/torch/remote.py b/horovod/spark/torch/remote.py
@@ -60,6 +60,7 @@ def RemoteTrainer(estimator, metadata, last_checkpoint_state, run_id, dataset_id
     transformation_fn = estimator.getTransformationFn()
     transformation = transformation_fn if transformation_fn else None
     inmemory_cache_all = estimator.getInMemoryCacheAll()
+    should_use_gpu = estimator.getUseGpu()
 
     # If loss weight is not provided, use equal loss for all the labels
     loss_weights = estimator.getLossWeights()
@@ -134,7 +135,16 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
                 raise ValueError("user_shuffle_buffer_size cannot be negative!")
             shuffle_buffer_size = user_shuffle_buffer_size
 
+        if not should_use_gpu and user_verbose:
+            print("Skip pinning current process to the GPU.")
+
         cuda_available = torch.cuda.is_available()
+
+        if cuda_available and not should_use_gpu:
+            print("GPU is available but use_gpu is set to False. "
+                  "Training will proceed without GPU support.")
+            cuda_available = False  # report this rank as CPU-only in the device-type check that follows
+
         # We need to check all ranks have same device type for traning.
         # Horovod doesn't support heterogeneous allreduce for gradients.
         cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
diff --git a/test/integration/test_spark_keras.py b/test/integration/test_spark_keras.py
@@ -98,7 +98,10 @@ def test_fit_model(self):
                     batch_size=1,
                     random_seed=1,
                     epochs=3,
-                    verbose=2)
+                    verbose=2,
+                    use_gpu=False)
+
+                assert not keras_estimator.getUseGpu()
 
                 keras_model = keras_estimator.fit(df)