Add multiaccelerator H100 tests to optional GPU presubmit.

Google-ML-Automation · Google-ML-Automation · commit ee0291bdef08 · 2025-04-25T13:30:24.000-07:00
PiperOrigin-RevId: 751224072
diff --git a/.github/workflows/bazel_optional_h100_b200.yml b/.github/workflows/bazel_optional_h100_b200.yml
@@ -1,4 +1,4 @@
-name: CI - Bazel Optional B200 CUDA tests
+name: CI - Bazel Optional H100 and B200 CUDA tests
 on:
   # Runs on PR if label "CI Optional GPU Presubmit" is present.
   workflow_dispatch:
@@ -36,10 +36,10 @@ jobs:
         uses: google-ml-infra/actions/ci_connection@main
         with:
           halt-dispatch-input: ${{ inputs.halt-for-connection }}
-      - name: Run Bazel CUDA Tests
+      - name: Run Bazel single B200 CUDA Tests
         run: |
             nvidia-smi
-            bazel test --config=ci_linux_x86_64_cuda \
+            bazel test --config=rbe_linux_x86_64_cuda \
             --config=resultstore \
             --config=rbe_cache \
             --repo_env=HERMETIC_CUDA_VERSION="12.8.0" \
@@ -50,6 +50,7 @@ jobs:
             --test_output=errors \
             --test_env=JAX_ACCELERATOR_COUNT=1 \
             --test_env=JAX_TESTS_PER_ACCELERATOR=32 \
+            --strategy=TestRunner=local \
             --local_test_jobs=32 \
             --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow \
             --test_tag_filters=-multiaccelerator \
@@ -60,4 +61,38 @@ jobs:
             --color=yes \
             //tests:gpu_tests //tests:backend_independent_tests \
             //tests/pallas:gpu_tests //tests/pallas:backend_independent_tests \
+            //tests/mosaic:gpu_tests //tests/mosaic:backend_independent_tests
+  run_multiaccelerator_tests:
+    if: ${{ github.event.repository.fork == false && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'CI Optional GPU Presubmit')) }}
+    runs-on: linux-x86-a3-8g-h100-8gpu
+    container: 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-cuda12.8-cudnn9.8:latest'
+    name: "Bazel multiple H100 CUDA tests"
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
+      - name: Wait For Connection
+        uses: google-ml-infra/actions/ci_connection@main
+        with:
+          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: Run Bazel multiple H100 CUDA Tests
+        run: |
+            nvidia-smi
+            bazel test --config=rbe_linux_x86_64_cuda \
+            --config=resultstore \
+            --config=rbe_cache \
+            --repo_env=HERMETIC_CUDA_VERSION="12.8.0" \
+            --repo_env=HERMETIC_CUDNN_VERSION="9.8.0" \
+            --repo_env=HERMETIC_PYTHON_VERSION="3.13" \
+            --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
+            --test_output=errors \
+            --strategy=TestRunner=local \
+            --local_test_jobs=8 \
+            --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow \
+            --test_tag_filters=multiaccelerator \
+            --test_env=TF_CPP_MIN_LOG_LEVEL=0 \
+            --test_env=JAX_SKIP_SLOW_TESTS=true \
+            --action_env=JAX_ENABLE_X64="1" \
+            --action_env=NCCL_DEBUG=WARN \
+            --color=yes \
+            //tests:gpu_tests //tests:backend_independent_tests \
+            //tests/pallas:gpu_tests //tests/pallas:backend_independent_tests \
             //tests/mosaic:gpu_tests //tests/mosaic:backend_independent_tests