From af16de08650e8f0f663d5497c41405db6e2c5e4f Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 4 Jun 2023 16:52:11 -0700 Subject: [PATCH 01/10] Split gpu ci workflow into single-gpu and multi-gpu --- .github/workflows/gpu-ci.yml | 20 +- ...02-Multi-GPU-Tensorflow-with-Horovod.ipynb | 214 ++++++++++-------- merlin/dataloader/utils/tf/tf_trainer.py | 4 +- ...t_multi_GPU_with_horovod_and_tensorflow.py | 62 +++-- tests/unit/dataloader/test_tf_dataloader.py | 1 + tox.ini | 26 ++- 6 files changed, 206 insertions(+), 121 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 9add26bd..cb7f9f09 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -12,8 +12,24 @@ on: jobs: build: - runs-on: 2GPU + runs-on: 1GPU + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Run tests + run: | + ref_type=${{ github.ref_type }} + branch=main + if [[ $ref_type == "tag"* ]] + then + raw=$(git branch -r --contains ${{ github.ref_name }}) + branch=${raw/origin\/} + fi + cd ${{ github.workspace }}; tox -r -e test-gpu -- $branch + multigpu: + runs-on: 2GPU steps: - uses: actions/checkout@v3 with: @@ -27,4 +43,4 @@ jobs: raw=$(git branch -r --contains ${{ github.ref_name }}) branch=${raw/origin\/} fi - cd ${{ github.workspace }}; tox -e test-gpu -- $branch + cd ${{ github.workspace }}; tox -r -e test-gpu-multigpu -- $branch diff --git a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb index 34776c6d..2f98b1bc 100644 --- a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb +++ b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb @@ -62,6 +62,8 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "\n", "from merlin.core.utils import download_file\n", "from merlin.core.dispatch import get_lib" ] @@ -76,13 +78,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "downloading ml-25m.zip: 262MB [00:07, 36.0MB/s] \n", - "unzipping files: 100%|██████████| 8/8 [00:08<00:00, 1.08s/files]\n" + "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", + "unzipping files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.60files/s]\n" ] } ], "source": [ - "DATA_PATH = '/workspace'\n", + "DATA_PATH = os.environ.get(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", "download_file(\"http://files.grouplens.org/datasets/movielens/ml-25m.zip\", DATA_PATH + \"/ml-25m.zip\")" ] }, @@ -115,12 +117,9 @@ "source": [ "GPU_COUNT = 2 # specify how many GPUs you would like to train on\n", "\n", - "ratings = get_lib().read_csv(DATA_PATH + '/ml-25m/ratings.csv')\n", + "ratings = get_lib().read_csv(DATA_PATH + \"/ml-25m/ratings.csv\")\n", "\n", - "for i in range(GPU_COUNT):\n", - " ratings[\n", - " int(i * ratings.shape[0] / GPU_COUNT):int((i + 1) * ratings.shape[0] / GPU_COUNT)\n", - " ].to_parquet(DATA_PATH + f'/train_{i}.parquet')" + "ratings.to_parquet(os.path.join(DATA_PATH, \"train.parquet\"))" ] }, { @@ -128,20 +127,15 @@ "id": "8b9a65b4", "metadata": {}, "source": [ - "**Important: Individual parquet files require to have the same number of batches. If one worker has more batches than another, the training process will freeze. At one point during the training process, the worker with more batches waits for the gradients from the worker with fewer batches. 
But the worker with fewer batches finished the training run.**`\n", - "\n", "Let us now take a closer look at what else we will need to train with Horovod.\n", "\n", "### Write the training script to a file\n", "\n", - "We need to have a `.py` file we will be able to load into each process using `horovodrun`. \n", + "We need to have a `.py` file we will be able to load into each process using `horovodrun`.\n", "\n", "### Set `CUDA visible devices` correctly inside each process\n", "\n", - "We need to set the visible device in each process to its `rank`. This way process with `rank 0` will use the zeroth GPU, process with `rank 1` will use the first GPU, and so on. This ensures that each worker can access only a single GPU.\n", - "\n", - "Additionally, we will use the `rank` information to select the correct parquet file to load per worker (`DATA_PATH + f'/train_{hvd.local_rank()}.parquet'`)\n", - "\n" + "We need to set the visible device in each process to its `rank`. This way process with `rank 0` will use the zeroth GPU, process with `rank 1` will use the first GPU, and so on. This ensures that each worker can access only a single GPU." ] }, { @@ -159,7 +153,7 @@ } ], "source": [ - "%%writefile './tf_trainer.py'\n", + "%%writefile \"./tf_trainer.py\"\n", "\n", "import os\n", "\n", @@ -176,20 +170,29 @@ "from merlin.io import Dataset\n", "\n", "import tensorflow as tf\n", - "import horovod.tensorflow.keras as hvd\n", + "import horovod.tensorflow as hvd\n", "\n", - "from glob import glob\n", "from merlin.core.dispatch import get_lib\n", + "\n", "os.environ[\"TF_GPU_ALLOCATOR\"] = \"cuda_malloc_async\"\n", "\n", "hvd.init()\n", "\n", "from merlin.loader.tensorflow import Loader\n", "\n", - "DATA_PATH = '/workspace'\n", "\n", - "dataset = Dataset(glob(DATA_PATH + f'/train_{hvd.local_rank()}.parquet'), part_size=\"100MB\")\n", - "loader = Loader(dataset, batch_size=64 * 1024)\n", + "DATA_PATH = os.getenv(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", + "\n", + "dataset = Dataset(os.path.join(DATA_PATH, \"train.parquet\"))\n", + "dataset = dataset.repartition(MPI_SIZE)\n", + "\n", + "loader = Loader(\n", + " dataset,\n", + " batch_size=64 * 1024,\n", + " global_size=MPI_SIZE,\n", + " global_rank=MPI_RANK,\n", + " device=MPI_RANK,\n", + ")\n", "\n", "label_column = 'rating'\n", "\n", @@ -206,8 +209,8 @@ "class MatrixFactorization(tf.keras.Model):\n", " def __init__(self, n_factors):\n", " super().__init__()\n", - " self.user_embeddings = tf.keras.layers.Embedding(162541, n_factors)\n", - " self.movie_embeddings = tf.keras.layers.Embedding(209171, n_factors)\n", + " self.user_embeddings = tf.keras.layers.Embedding(162542, n_factors)\n", + " self.movie_embeddings = tf.keras.layers.Embedding(209172, n_factors)\n", "\n", " def call(self, batch, training=False):\n", " user_embs = self.user_embeddings(batch['userId'])\n", @@ -218,16 +221,51 @@ "\n", "\n", "model = MatrixFactorization(64)\n", - "opt = tf.keras.optimizers.Adam(1e-2 * hvd.size())\n", - "opt = hvd.DistributedOptimizer(opt)\n", - "model.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError(), experimental_run_tf_function=False)\n", - "\n", - "model.fit(\n", - " loader,\n", - " epochs=1,\n", - " callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)],\n", - " verbose=1 if hvd.rank() == 0 else 0\n", - ")" + "loss = tf.keras.losses.MeanSquaredError()\n", + "opt = tf.optimizers.Adam(1e-2 * hvd.size())\n", + "\n", + "checkpoint_prefix = \"./checkpoints\"\n", + "checkpoint = tf.train.Checkpoint(model=model, 
optimizer=opt)\n", + "\n", + "\n", + "@tf.function\n", + "def training_step(features, labels, first_batch):\n", + " with tf.GradientTape() as tape:\n", + " probs = model(features, training=True)\n", + " loss_value = loss(labels, probs)\n", + "\n", + " # Horovod: add Horovod Distributed GradientTape.\n", + " tape = hvd.DistributedGradientTape(tape)\n", + "\n", + " grads = tape.gradient(loss_value, model.trainable_variables)\n", + " opt.apply_gradients(zip(grads, model.trainable_variables))\n", + "\n", + " # Horovod: broadcast initial variable states from rank 0 to all other processes.\n", + " # This is necessary to ensure consistent initialization of all workers when\n", + " # training is started with random weights or restored from a checkpoint.\n", + " #\n", + " # Note: broadcast should be done after the first gradient step to ensure optimizer\n", + " # initialization.\n", + " if first_batch:\n", + " hvd.broadcast_variables(model.variables, root_rank=0)\n", + " hvd.broadcast_variables(opt.variables(), root_rank=0)\n", + "\n", + " return loss_value\n", + "\n", + "\n", + "# Horovod: adjust number of steps based on number of GPUs.\n", + "for batch, (features, labels) in enumerate(loader):\n", + " loss_value = training_step(features, labels, batch == 0)\n", + "\n", + " if batch % 10 == 0 and hvd.rank() == 0:\n", + " print('Step #%d\\tLoss: %.6f' % (batch, loss_value))\n", + "\n", + "hvd.join()\n", + "\n", + "# Horovod: save checkpoints only on worker 0 to prevent other workers from\n", + "# corrupting it.\n", + "if hvd.rank() == 0:\n", + " checkpoint.save(checkpoint_prefix)" ] }, { @@ -252,69 +290,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "[1,0]:2022-12-08 06:58:30.501381: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "[1,0]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,0]:2022-12-08 06:58:30.555187: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,0]:2022-12-08 06:58:30.555454: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:85:00.0, compute capability: 7.0\n", - "[1,1]:2022-12-08 06:58:30.575717: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "[1,1]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,1]:2022-12-08 06:58:30.632564: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,1]:2022-12-08 06:58:30.632832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:86:00.0, compute capability: 7.0\n", - "[1,0]:2022-12-08 06:58:35.010671: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. 
Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n", - "[1,0]:type_id: TFT_OPTIONAL\n", - "[1,0]:args {\n", - "[1,0]: type_id: TFT_PRODUCT\n", - "[1,0]: args {\n", - "[1,0]: type_id: TFT_TENSOR\n", - "[1,0]: args {\n", - "[1,0]: type_id: TFT_BOOL\n", - "[1,0]: }\n", - "[1,0]: }\n", - "[1,0]:}\n", - "[1,0]: is neither a subtype nor a supertype of the combined inputs preceding it:\n", - "[1,0]:type_id: TFT_OPTIONAL\n", - "[1,0]:args {\n", - "[1,0]: type_id: TFT_PRODUCT\n", - "[1,0]: args {\n", - "[1,0]: type_id: TFT_TENSOR\n", - "[1,0]: args {\n", - "[1,0]: type_id: TFT_LEGACY_VARIANT\n", - "[1,0]: }\n", - "[1,0]: }\n", - "[1,0]:}\n", - "[1,0]:\n", - "[1,0]:\twhile inferring type of node 'mean_squared_error/cond/output/_11'\n", - "[1,1]:2022-12-08 06:58:35.218048: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n", - "[1,1]:type_id: TFT_OPTIONAL\n", - "[1,1]:args {\n", - "[1,1]: type_id: TFT_PRODUCT\n", - "[1,1]: args {\n", - "[1,1]: type_id: TFT_TENSOR\n", - "[1,1]: args {\n", - "[1,1]: type_id: TFT_BOOL\n", - "[1,1]: }\n", - "[1,1]: }\n", - "[1,1]:}\n", - "[1,1]: is neither a subtype nor a supertype of the combined inputs preceding it:\n", - "[1,1]:type_id: TFT_OPTIONAL\n", - "[1,1]:args {\n", - "[1,1]: type_id: TFT_PRODUCT\n", - "[1,1]: args {\n", - "[1,1]: type_id: TFT_TENSOR\n", - "[1,1]: args {\n", - "[1,1]: type_id: TFT_LEGACY_VARIANT\n", - "[1,1]: }\n", - "[1,1]: }\n", - "[1,1]:}\n", - "[1,1]:\n", - "[1,1]:\twhile inferring type of node 'mean_squared_error/cond/output/_11'\n", - " 6/191 [..............................] - ETA: 2s - loss: 13.6433 [1,0]:/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n", - "[1,0]: warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n", - "[1,0]:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0094s vs `on_train_batch_end` time: 0.1490s). Check your callbacks.\n", - "[1,1]:/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n", - "[1,1]: warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n", - "[1,1]:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0093s vs `on_train_batch_end` time: 0.1489s). Check your callbacks.\n", - "191/191 [==============================] - 8s 12ms/step - loss: 3.3301[1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0][1,0]\n" + "2023-06-03 21:35:18.892140: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-06-03 21:35:18.932879: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "[1,1]:2023-06-03 21:35:23.549563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "[1,0]:2023-06-03 21:35:23.568539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "[1,1]:2023-06-03 21:35:23.592349: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "[1,1]:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "[1,0]:2023-06-03 21:35:23.609861: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "[1,0]:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "[1,1]:2023-06-03 21:35:28.092241: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", + "[1,1]:2023-06-03 21:35:28.092336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24337 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", + "[1,0]:2023-06-03 21:35:28.141988: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", + "[1,0]:2023-06-03 21:35:28.142076: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24338 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", + "[1,0]:2023-06-03 21:35:32.089463: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f1150020480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", + "[1,0]:2023-06-03 21:35:32.089532: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", + "[1,1]:2023-06-03 21:35:32.089552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x79f8590 initialized for platform CUDA (this does not guarantee that XLA will be used). 
Devices:\n", + "[1,1]:2023-06-03 21:35:32.089613: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", + "[1,1]:2023-06-03 21:35:32.101885: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "[1,0]:2023-06-03 21:35:32.102268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", + "[1,1]:2023-06-03 21:35:33.637854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", + "[1,0]:2023-06-03 21:35:33.648275: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", + "[1,1]:2023-06-03 21:35:33.834015: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", + "[1,0]:2023-06-03 21:35:33.854743: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", + "[1,0]:Step #0\tLoss: 13.976147\n", + "[1,0]:Step #10\tLoss: 13.746956\n", + "[1,0]:Step #20\tLoss: 13.907515\n", + "[1,0]:Step #30\tLoss: 14.084653\n", + "[1,0]:Step #40\tLoss: 13.346972\n", + "[1,0]:Step #50\tLoss: 13.931261\n", + "[1,0]:Step #60\tLoss: 13.707795\n", + "[1,0]:Step #70\tLoss: 13.510033\n", + "[1,0]:Step #80\tLoss: 13.372274\n", + "[1,0]:Step #90\tLoss: 13.713926\n", + "[1,0]:Step #100\tLoss: 13.236437\n", + "[1,0]:Step #110\tLoss: 13.265822\n", + "[1,0]:Step #120\tLoss: 13.991277\n", + "[1,0]:Step #130\tLoss: 14.069466\n", + "[1,0]:Step #140\tLoss: 13.635876\n", + "[1,0]:Step #150\tLoss: 13.416016\n", + "[1,0]:Step #160\tLoss: 13.216636\n", + "[1,0]:Step #170\tLoss: 12.776440\n", + "[1,0]:Step #180\tLoss: 13.570569\n", + "[1,0]:Step #190\tLoss: 13.868576\n", + "[1,1]:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", + "[1,1]: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", + "[1,0]:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", + "[1,0]: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" ] } ], diff --git a/merlin/dataloader/utils/tf/tf_trainer.py b/merlin/dataloader/utils/tf/tf_trainer.py index de16f91c..419d7301 100644 --- a/merlin/dataloader/utils/tf/tf_trainer.py +++ b/merlin/dataloader/utils/tf/tf_trainer.py @@ -76,7 +76,7 @@ def seed_fn(): EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(proc) EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES) -ds = Dataset(TRAIN_PATHS, engine="parquet", part_mem_frac=0.06) +ds = Dataset(TRAIN_PATHS, engine="parquet") train_dataset_tf = Loader( ds, # you could also use a glob pattern batch_size=BATCH_SIZE, @@ -112,7 +112,7 @@ def seed_fn(): x = tf.keras.layers.Dense(1, activation="sigmoid")(x) model = tf.keras.Model(inputs=inputs, outputs=x) loss = tf.losses.BinaryCrossentropy() -opt = tf.keras.optimizers.SGD(0.01 * hvd.size()) +opt = tf.keras.optimizers.legacy.SGD(0.01 * hvd.size()) opt = hvd.DistributedOptimizer(opt) checkpoint_dir = "./checkpoints" checkpoint = 
tf.train.Checkpoint(model=model, optimizer=opt) diff --git a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py index 50c0e33f..9a58b6db 100644 --- a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py +++ b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py @@ -1,4 +1,8 @@ import os +import subprocess + +import numpy as np +import pandas as pd import pytest from testbook import testbook @@ -7,22 +11,50 @@ pytestmark = pytest.mark.tensorflow -@testbook("examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb", execute=False) -def test_getting_started_tensorflow(tb): +@pytest.mark.multigpu +@testbook("examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb", execute=False, timeout=120) +def test_getting_started_tensorflow(tb, tmpdir): + ml_25m_dir = tmpdir / "ml-25" + ml_25m_dir.mkdir() + ratings_path = ml_25m_dir / "ratings.csv" + pd.DataFrame( + { + "userId": np.random.randint(0, 10, 100_000), + "movieId": np.random.randint(0, 10, 100_000), + "rating": np.random.randint(0, 5, 100_000).astype(np.float32), + } + ).to_csv(ratings_path, index=False) + tb.inject( - """ - import pandas as pd - import numpy as np - - !mkdir -p /tmp/ml-25m - pd.DataFrame({ - 'userId': np.random.randint(0, 10, 100_000), - 'movieId': np.random.randint(0, 10, 100_000), - 'rating': np.random.randint(0, 5, 100_000).astype(np.float32) - }).to_csv('/tmp/ml-25m/ratings.csv', index=False) + f""" + import os + os.environ["DATA_PATH"] = "{str(tmpdir)}" """ ) - tb.cells[4].source = "DATA_PATH = '/tmp'" - tb.cells[7].source.replace("GPU_COUNT = 2", "GPU_COUNT = 1") + tb.execute() - os.system("horovodrun -np 1 python tf_trainer.py") + + curr_path = os.path.abspath(__file__) + repo_root = os.path.relpath(os.path.normpath(os.path.join(curr_path, "../../.."))) + hvd_wrap_path = os.path.join(repo_root, "merlin/dataloader/utils/tf/hvd_wrapper.sh") + with subprocess.Popen( + [ + "horovodrun", + "-np", + "2", + "-H", + "localhost:2", + "sh", + hvd_wrap_path, + "python", + "tf_trainer.py", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as process: + process.wait() + stdout, stderr = process.communicate() + print(stdout, stderr) + assert "Loss" in str(stdout) + + assert any(f.startswith("checkpoints-") for f in os.listdir(os.getcwd())) diff --git a/tests/unit/dataloader/test_tf_dataloader.py b/tests/unit/dataloader/test_tf_dataloader.py index 10443897..f8cad966 100644 --- a/tests/unit/dataloader/test_tf_dataloader.py +++ b/tests/unit/dataloader/test_tf_dataloader.py @@ -535,6 +535,7 @@ def test_multigpu_partitioning(dataset, batch_size, global_rank): assert indices == [global_rank] +@pytest.mark.multigpu @pytest.mark.skipif( os.environ.get("NR_USER") is not None, reason="not working correctly in ci environment", diff --git a/tox.ini b/tox.ini index afe62a2c..1cb98629 100644 --- a/tox.ini +++ b/tox.ini @@ -66,12 +66,8 @@ passenv = NR_USER CUDA_VISIBLE_DEVICES sitepackages=true -; Runs in: Internal Jenkins +; Runs in: 1GPU Github Actions runners. ; Runs GPU-based tests. -; The jenkins jobs run on an image based on merlin-hugectr. This will include all cudf configuration -; and other gpu-specific libraries that we can enxpect will always exist. Thus, we don't need -; to install requirements.txt yet. As we get better at python environment isolation, we will -; need to add some back. 
deps = -rrequirements/dev.txt pytest @@ -80,7 +76,25 @@ commands = python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git - python -m pytest --cov-report term --cov merlin -rxs tests/unit tests/examples + python -m pytest -m "singlegpu or not multigpu" --cov-report term --cov merlin -rxs tests/unit/ tests/examples/ + +[testenv:test-gpu-multigpu] +passenv = + CUDA_VISIBLE_DEVICES + OPAL_PREFIX + NR_USER +sitepackages=true +; Runs in: 1GPU Github Actions runners. +; Runs GPU-based tests. +deps = + -rrequirements/dev.txt + pytest + pytest-cov +commands = + python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/core.git + python -m pip install --upgrade git+https://github.com/NVIDIA-Merlin/nvtabular.git + + python -m pytest -m "multigpu" --cov-report term --cov merlin -rxs tests/unit/ tests/examples/ [testenv:test-models-cpu] passenv=GIT_COMMIT From b8e96bef8bfe9b04dec684abd8de5f2f5b7c9f7a Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 4 Jun 2023 17:41:42 -0700 Subject: [PATCH 02/10] skip multi gpu example test if horovod is not available --- tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py index 9a58b6db..a92015f1 100644 --- a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py +++ b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py @@ -7,6 +7,7 @@ from testbook import testbook pytest.importorskip("tensorflow") +pytest.importorskip("horovod") pytestmark = pytest.mark.tensorflow From 5f214a8241a58fc01ee6752674fc98adb6e21c6e Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 5 Jun 2023 21:41:55 -0700 Subject: [PATCH 03/10] add pytest.ini --- pytest.ini | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..3b9940df --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + multigpu: Tests only run in multiple-GPU environments + singlegpu: Optional marker to run tests in single-GPU environments. Usually used when running in both single- and multi-GPU. 
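For context, the markers registered in pytest.ini above pair with the selection expressions that PATCH 01 added to tox.ini: the single-GPU environment collects with `-m "singlegpu or not multigpu"`, while the multi-GPU environment collects with `-m "multigpu"`. A minimal sketch of how a test module would use the two markers (hypothetical test names, not part of this series):

    import pytest

    @pytest.mark.multigpu
    def test_needs_two_gpus():
        # carries only "multigpu", so "singlegpu or not multigpu" excludes it:
        # collected by the test-gpu-multigpu environment only
        ...

    @pytest.mark.singlegpu
    @pytest.mark.multigpu
    def test_runs_in_both_environments():
        # "singlegpu" opts the test back into the single-GPU run even though
        # it also carries "multigpu", so both environments collect it
        ...

    def test_unmarked():
        # matches "not multigpu": collected by the single-GPU environment only
        ...
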
From 1be43a3c25464b3b7c6e57c2958e99c23cf859d2 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 20:25:10 -0700 Subject: [PATCH 04/10] update tf_trainer in horovod unit test --- merlin/dataloader/utils/tf/tf_trainer.py | 31 ++++++++++++------------ 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/merlin/dataloader/utils/tf/tf_trainer.py b/merlin/dataloader/utils/tf/tf_trainer.py index 419d7301..446259ad 100644 --- a/merlin/dataloader/utils/tf/tf_trainer.py +++ b/merlin/dataloader/utils/tf/tf_trainer.py @@ -6,6 +6,9 @@ from merlin.core.compat import cupy, numpy from merlin.io import Dataset +from nvtabular.ops import TagAsItemID, TagAsUserID, JoinExternal +from merlin.core.dispatch import get_lib +from merlin.schema import Tags # we can control how much memory to give tensorflow with this environment variable # IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise @@ -38,11 +41,8 @@ BASE_DIR = args.dir_in or "./data/" BATCH_SIZE = int(args.batch_size or 16384) # Batch Size CATEGORICAL_COLUMNS = args.cats or ["movieId", "userId"] # Single-hot -CATEGORICAL_MH_COLUMNS = args.cats_mh or ["genres"] # Multi-hot -NUMERIC_COLUMNS = args.conts or [] -TRAIN_PATHS = sorted( - glob.glob(os.path.join(BASE_DIR, "train/*.parquet")) -) # Output from ETL-with-NVTabular +LABEL_COLUMNS = ["rating"] + hvd.init() # Seed with system randomness (or a static seed) @@ -76,26 +76,25 @@ def seed_fn(): EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(proc) EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES) -ds = Dataset(TRAIN_PATHS, engine="parquet") -train_dataset_tf = Loader( - ds, # you could also use a glob pattern +train_ds = nvt.Dataset(f'{BASE_DIR}/train', engine='parquet', dtypes={'rating': xp.int8}) +train_ds.schema = train_ds.schema.remove_col('genres') + +target_column = train_ds.schema.select_by_tag(Tags.TARGET).column_names[0] + +train_loader = Loader( + train_ds, batch_size=BATCH_SIZE, shuffle=True, seed_fn=seed_fn, global_size=hvd.size(), global_rank=hvd.rank(), ) + inputs = {} # tf.keras.Input placeholders for each feature to be used emb_layers = [] # output of all embedding layers, which will be concatenated for col in CATEGORICAL_COLUMNS: inputs[col] = tf.keras.Input(name=col, dtype=tf.int32, shape=(1,)) -# Note that we need two input tensors for multi-hot categorical features -for col in CATEGORICAL_MH_COLUMNS: - inputs[col] = ( - tf.keras.Input(name=f"{col}__values", dtype=tf.int64, shape=(1,)), - tf.keras.Input(name=f"{col}__lengths", dtype=tf.int64, shape=(1,)), - ) -for col in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS: +for col in CATEGORICAL_COLUMNS: emb_layers.append( tf.feature_column.embedding_column( tf.feature_column.categorical_column_with_identity( @@ -140,7 +139,7 @@ def training_step(examples, labels, first_batch): # Horovod: adjust number of steps based on number of GPUs. 
-for batch, (examples, labels) in enumerate(train_dataset_tf): +for batch, (examples, labels) in enumerate(train_loader): loss_value = training_step(examples, labels, batch == 0) if batch % 100 == 0 and hvd.local_rank() == 0: print(f"Step #{batch}\tLoss: {loss_value:.6f}") From 078822d1d4214370380a208858c0a2625c61398a Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 21:10:02 -0700 Subject: [PATCH 05/10] Fix data path in multi gpu notebook --- ...02-Multi-GPU-Tensorflow-with-Horovod.ipynb | 103 +++++++++--------- ...t_multi_GPU_with_horovod_and_tensorflow.py | 2 +- 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb index 2f98b1bc..a34f0282 100644 --- a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb +++ b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb @@ -78,8 +78,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", - "unzipping files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.60files/s]\n" + "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", + "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n" ] } ], @@ -155,6 +155,7 @@ "source": [ "%%writefile \"./tf_trainer.py\"\n", "\n", + "import argparse\n", "import os\n", "\n", "# the order of statements and imports is imoportant\n", @@ -180,15 +181,20 @@ "\n", "from merlin.loader.tensorflow import Loader\n", "\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n", + "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n", + "args = parser.parse_args()\n", "\n", - "DATA_PATH = os.getenv(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", + "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n", + "BATCH_SIZE = args.batch_size or 1024\n", "\n", "dataset = Dataset(os.path.join(DATA_PATH, \"train.parquet\"))\n", "dataset = dataset.repartition(MPI_SIZE)\n", "\n", "loader = Loader(\n", " dataset,\n", - " batch_size=64 * 1024,\n", + " batch_size=BATCH_SIZE,\n", " global_size=MPI_SIZE,\n", " global_rank=MPI_RANK,\n", " device=MPI_RANK,\n", @@ -280,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "ec5e9b7f", "metadata": { "scrolled": true @@ -290,58 +296,49 @@ "name": "stdout", "output_type": "stream", "text": [ - "2023-06-03 21:35:18.892140: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-06-03 21:35:18.932879: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,1]:2023-06-03 21:35:23.549563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "[1,0]:2023-06-03 21:35:23.568539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "[1,1]:2023-06-03 21:35:23.592349: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "[1,1]:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,0]:2023-06-03 21:35:23.609861: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "[1,0]:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,1]:2023-06-03 21:35:28.092241: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,1]:2023-06-03 21:35:28.092336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24337 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", - "[1,0]:2023-06-03 21:35:28.141988: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,0]:2023-06-03 21:35:28.142076: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24338 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", - "[1,0]:2023-06-03 21:35:32.089463: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f1150020480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", - "[1,0]:2023-06-03 21:35:32.089532: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", - "[1,1]:2023-06-03 21:35:32.089552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x79f8590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", - "[1,1]:2023-06-03 21:35:32.089613: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", - "[1,1]:2023-06-03 21:35:32.101885: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", - "[1,0]:2023-06-03 21:35:32.102268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", - "[1,1]:2023-06-03 21:35:33.637854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", - "[1,0]:2023-06-03 21:35:33.648275: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", - "[1,1]:2023-06-03 21:35:33.834015: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! 
This line is logged at most once for the lifetime of the process.\n", - "[1,0]:2023-06-03 21:35:33.854743: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", - "[1,0]:Step #0\tLoss: 13.976147\n", - "[1,0]:Step #10\tLoss: 13.746956\n", - "[1,0]:Step #20\tLoss: 13.907515\n", - "[1,0]:Step #30\tLoss: 14.084653\n", - "[1,0]:Step #40\tLoss: 13.346972\n", - "[1,0]:Step #50\tLoss: 13.931261\n", - "[1,0]:Step #60\tLoss: 13.707795\n", - "[1,0]:Step #70\tLoss: 13.510033\n", - "[1,0]:Step #80\tLoss: 13.372274\n", - "[1,0]:Step #90\tLoss: 13.713926\n", - "[1,0]:Step #100\tLoss: 13.236437\n", - "[1,0]:Step #110\tLoss: 13.265822\n", - "[1,0]:Step #120\tLoss: 13.991277\n", - "[1,0]:Step #130\tLoss: 14.069466\n", - "[1,0]:Step #140\tLoss: 13.635876\n", - "[1,0]:Step #150\tLoss: 13.416016\n", - "[1,0]:Step #160\tLoss: 13.216636\n", - "[1,0]:Step #170\tLoss: 12.776440\n", - "[1,0]:Step #180\tLoss: 13.570569\n", - "[1,0]:Step #190\tLoss: 13.868576\n", - "[1,1]:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", - "[1,1]: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", - "[1,0]:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", - "[1,0]: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" + "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "[1,0]:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "[1,0]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "[1,1]:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", + "[1,1]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "[1,1]:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "[1,0]:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "[1,1]:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", + "[1,1]:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", + "[1,0]:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", + "[1,0]:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", + "[1,1]:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", + "[1,1]:Instructions for updating:\n", + "[1,1]:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", + "[1,0]:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", + "[1,0]:Instructions for updating:\n", + "[1,0]:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. 
https://github.com/tensorflow/tensorflow/issues/56089\n", + "[1,0]:Step #0\tLoss: 13.976286\n", + "[1,0]:Step #10\tLoss: 13.746111\n", + "[1,0]:Step #20\tLoss: 13.905323\n", + "[1,0]:Step #30\tLoss: 14.093473\n", + "[1,0]:Step #40\tLoss: 13.336206\n", + "[1,0]:Step #50\tLoss: 13.932583\n", + "[1,0]:Step #60\tLoss: 13.702780\n", + "[1,0]:Step #70\tLoss: 13.522057\n", + "[1,0]:Step #80\tLoss: 13.382860\n", + "[1,0]:Step #90\tLoss: 13.701270\n", + "[1,0]:Step #100\tLoss: 13.240610\n", + "[1,0]:Step #110\tLoss: 13.264977\n", + "[1,0]:Step #120\tLoss: 13.984927\n", + "[1,0]:Step #130\tLoss: 14.039978\n", + "[1,0]:Step #140\tLoss: 13.639907\n", + "[1,0]:Step #150\tLoss: 13.430090\n", + "[1,0]:Step #160\tLoss: 13.219415\n", + "[1,0]:Step #170\tLoss: 12.758451\n", + "[1,0]:Step #180\tLoss: 13.592442\n" ] } ], "source": [ - "!horovodrun -np {GPU_COUNT} python tf_trainer.py" + "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" ] }, { diff --git a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py index a92015f1..2605dc99 100644 --- a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py +++ b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py @@ -56,6 +56,6 @@ def test_getting_started_tensorflow(tb, tmpdir): process.wait() stdout, stderr = process.communicate() print(stdout, stderr) - assert "Loss" in str(stdout) + assert "Loss:" in str(stdout) assert any(f.startswith("checkpoints-") for f in os.listdir(os.getcwd())) From 074a33e80f1ef976c86f9b0384508c1d09abfa17 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 21:15:00 -0700 Subject: [PATCH 06/10] lint --- merlin/dataloader/utils/tf/tf_trainer.py | 8 ++------ tests/unit/dataloader/test_tf_dataloader.py | 6 +----- tox.ini | 1 - 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/merlin/dataloader/utils/tf/tf_trainer.py b/merlin/dataloader/utils/tf/tf_trainer.py index 446259ad..9b5df70a 100644 --- a/merlin/dataloader/utils/tf/tf_trainer.py +++ b/merlin/dataloader/utils/tf/tf_trainer.py @@ -1,13 +1,9 @@ # External dependencies import argparse -import glob import logging import os from merlin.core.compat import cupy, numpy -from merlin.io import Dataset -from nvtabular.ops import TagAsItemID, TagAsUserID, JoinExternal -from merlin.core.dispatch import get_lib from merlin.schema import Tags # we can control how much memory to give tensorflow with this environment variable @@ -76,8 +72,8 @@ def seed_fn(): EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(proc) EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES) -train_ds = nvt.Dataset(f'{BASE_DIR}/train', engine='parquet', dtypes={'rating': xp.int8}) -train_ds.schema = train_ds.schema.remove_col('genres') +train_ds = nvt.Dataset(f"{BASE_DIR}/train", engine="parquet", dtypes={"rating": xp.int8}) +train_ds.schema = train_ds.schema.remove_col("genres") target_column = train_ds.schema.select_by_tag(Tags.TARGET).column_names[0] diff --git a/tests/unit/dataloader/test_tf_dataloader.py b/tests/unit/dataloader/test_tf_dataloader.py index f8cad966..8cdef6a0 100644 --- a/tests/unit/dataloader/test_tf_dataloader.py +++ b/tests/unit/dataloader/test_tf_dataloader.py @@ -536,10 +536,6 @@ def test_multigpu_partitioning(dataset, batch_size, global_rank): @pytest.mark.multigpu -@pytest.mark.skipif( - os.environ.get("NR_USER") is not None, - reason="not working correctly in ci environment", -) 
@pytest.mark.skipif(importlib.util.find_spec("horovod") is None, reason="needs horovod") @pytest.mark.skipif( HAS_GPU and cupy and cupy.cuda.runtime.getDeviceCount() <= 1, @@ -722,7 +718,7 @@ def test_wrong_batch_size_raises_warning(): _ = tf_loader(dataset, batch_size=batch_size) for power in range(4, 10): - batch_size = 2**power + batch_size = 2 ** power # warning not raised for power of two with warnings.catch_warnings(): warnings.simplefilter("error") diff --git a/tox.ini b/tox.ini index ab3a8477..a5950690 100644 --- a/tox.ini +++ b/tox.ini @@ -82,7 +82,6 @@ commands = passenv = CUDA_VISIBLE_DEVICES OPAL_PREFIX - NR_USER sitepackages=true ; Runs in: 1GPU Github Actions runners. ; Runs GPU-based tests. From 8239008cbec12656b2e39f191fc1448bfa32ea5f Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 23:09:44 -0700 Subject: [PATCH 07/10] lint --- ...02-Multi-GPU-Tensorflow-with-Horovod.ipynb | 82 +++---------------- tests/unit/dataloader/test_tf_dataloader.py | 2 +- 2 files changed, 11 insertions(+), 73 deletions(-) diff --git a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb index a34f0282..208b1b8e 100644 --- a/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb +++ b/examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "bb28e271", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "edd46306", "metadata": {}, "outputs": [], @@ -70,19 +70,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "591f8c61", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", - "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n" - ] - } - ], + "outputs": [], "source": [ "DATA_PATH = os.environ.get(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", "download_file(\"http://files.grouplens.org/datasets/movielens/ml-25m.zip\", DATA_PATH + \"/ml-25m.zip\")" @@ -110,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "c65e5ef6", "metadata": {}, "outputs": [], @@ -140,18 +131,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "9fbe17a7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting ./tf_trainer.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile \"./tf_trainer.py\"\n", "\n", @@ -183,7 +166,7 @@ "\n", "parser = argparse.ArgumentParser()\n", "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n", - "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n", + "parser.add_argument(\"--batch_size\", type=int, default=None, help=\"Batch size.\")\n", "args = parser.parse_args()\n", "\n", "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n", @@ -291,54 +274,9 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "To enable them in other operations, rebuild 
TensorFlow with the appropriate compiler flags.\n", - "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "[1,0]:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "[1,0]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,1]:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", - "[1,1]:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "[1,1]:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "[1,0]:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "[1,1]:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,1]:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", - "[1,0]:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", - "[1,0]:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", - "[1,1]:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", - "[1,1]:Instructions for updating:\n", - "[1,1]:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", - "[1,0]:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", - "[1,0]:Instructions for updating:\n", - "[1,0]:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. 
https://github.com/tensorflow/tensorflow/issues/56089\n", - "[1,0]:Step #0\tLoss: 13.976286\n", - "[1,0]:Step #10\tLoss: 13.746111\n", - "[1,0]:Step #20\tLoss: 13.905323\n", - "[1,0]:Step #30\tLoss: 14.093473\n", - "[1,0]:Step #40\tLoss: 13.336206\n", - "[1,0]:Step #50\tLoss: 13.932583\n", - "[1,0]:Step #60\tLoss: 13.702780\n", - "[1,0]:Step #70\tLoss: 13.522057\n", - "[1,0]:Step #80\tLoss: 13.382860\n", - "[1,0]:Step #90\tLoss: 13.701270\n", - "[1,0]:Step #100\tLoss: 13.240610\n", - "[1,0]:Step #110\tLoss: 13.264977\n", - "[1,0]:Step #120\tLoss: 13.984927\n", - "[1,0]:Step #130\tLoss: 14.039978\n", - "[1,0]:Step #140\tLoss: 13.639907\n", - "[1,0]:Step #150\tLoss: 13.430090\n", - "[1,0]:Step #160\tLoss: 13.219415\n", - "[1,0]:Step #170\tLoss: 12.758451\n", - "[1,0]:Step #180\tLoss: 13.592442\n" - ] - } - ], + "outputs": [], "source": [ - "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" + "! horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" ] }, { diff --git a/tests/unit/dataloader/test_tf_dataloader.py b/tests/unit/dataloader/test_tf_dataloader.py index 8cdef6a0..1b7e3c1f 100644 --- a/tests/unit/dataloader/test_tf_dataloader.py +++ b/tests/unit/dataloader/test_tf_dataloader.py @@ -718,7 +718,7 @@ def test_wrong_batch_size_raises_warning(): _ = tf_loader(dataset, batch_size=batch_size) for power in range(4, 10): - batch_size = 2 ** power + batch_size = 2**power # warning not raised for power of two with warnings.catch_warnings(): warnings.simplefilter("error") From e193ae18a267c5a94318efc7eb70eeec615045d1 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 23:32:20 -0700 Subject: [PATCH 08/10] fix horovodrun command in unit test --- tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py index 2605dc99..6b0be9c6 100644 --- a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py +++ b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py @@ -49,6 +49,8 @@ def test_getting_started_tensorflow(tb, tmpdir): hvd_wrap_path, "python", "tf_trainer.py", + f"--data_path={str(tmpdir)}", + "--batch_size=65536", ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, From 39d434707f634561c06dd6ca655afc5c8c3375e3 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 23:42:58 -0700 Subject: [PATCH 09/10] minor fixes --- merlin/dataloader/utils/tf/tf_trainer.py | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/merlin/dataloader/utils/tf/tf_trainer.py b/merlin/dataloader/utils/tf/tf_trainer.py index 9b5df70a..9e589768 100644 --- a/merlin/dataloader/utils/tf/tf_trainer.py +++ b/merlin/dataloader/utils/tf/tf_trainer.py @@ -109,7 +109,7 @@ def seed_fn(): loss = tf.losses.BinaryCrossentropy() opt = tf.keras.optimizers.legacy.SGD(0.01 * hvd.size()) opt = hvd.DistributedOptimizer(opt) -checkpoint_dir = "./checkpoints" +checkpoint_dir = os.path.join(BASE_DIR, "checkpoints") checkpoint = tf.train.Checkpoint(model=model, optimizer=opt) diff --git a/tox.ini b/tox.ini index a5950690..b30386d4 100644 --- a/tox.ini +++ b/tox.ini @@ -83,7 +83,7 @@ passenv = CUDA_VISIBLE_DEVICES OPAL_PREFIX sitepackages=true -; Runs in: 1GPU Github Actions runners. +; Runs in: 2GPU Github Actions runners. ; Runs GPU-based tests. 
deps = -rrequirements/dev.txt From bb4a702033fa8cdcf2fa7f473f2706d68b48ec30 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 7 Jun 2023 23:56:22 -0700 Subject: [PATCH 10/10] fix checkpoint file check --- merlin/dataloader/utils/tf/tf_trainer.py | 1 - tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/merlin/dataloader/utils/tf/tf_trainer.py b/merlin/dataloader/utils/tf/tf_trainer.py index 9e589768..8d4e8325 100644 --- a/merlin/dataloader/utils/tf/tf_trainer.py +++ b/merlin/dataloader/utils/tf/tf_trainer.py @@ -37,7 +37,6 @@ BASE_DIR = args.dir_in or "./data/" BATCH_SIZE = int(args.batch_size or 16384) # Batch Size CATEGORICAL_COLUMNS = args.cats or ["movieId", "userId"] # Single-hot -LABEL_COLUMNS = ["rating"] hvd.init() diff --git a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py index 6b0be9c6..9d48bdad 100644 --- a/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py +++ b/tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py @@ -60,4 +60,4 @@ def test_getting_started_tensorflow(tb, tmpdir): print(stdout, stderr) assert "Loss:" in str(stdout) - assert any(f.startswith("checkpoints-") for f in os.listdir(os.getcwd())) + assert any(f.startswith("checkpoints-") for f in os.listdir(tmpdir))
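
With the series applied, the two CI jobs in gpu-ci.yml reduce to two tox entry points that can also be run by hand. A minimal sketch of the local invocations, assuming a machine with at least two GPUs and `main` standing in for the branch argument the workflow normally computes:

    # single-GPU suite: everything not marked "multigpu"
    # (plus anything opted back in via "singlegpu")
    tox -r -e test-gpu -- main

    # multi-GPU suite: only tests marked "multigpu"
    tox -r -e test-gpu-multigpu -- main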