diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml
index 0e34cb7a2..003605148 100644
--- a/.github/eks-workflow-files/mpi-nccl-test.yml
+++ b/.github/eks-workflow-files/mpi-nccl-test.yml
@@ -26,6 +26,13 @@ spec:
             - image: PLACEHOLDER
               imagePullPolicy: IfNotPresent
               name: PLACEHOLDER
+              env:
+              - name: PATH
+                value: $PATH:/opt/amazon/efa/bin:/usr/bin:/usr/local/mpi/bin
+              - name: LD_LIBRARY_PATH
+                value: "/usr/local/mpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH"
+              - name: NCCL_TUNER_PLUGIN
+                value: /opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
               command:
                 - bash
                 - -c
@@ -41,8 +48,8 @@ spec:
                     echo "Workers were still not reachable after ${limit}, exiting"
                     exit 1
                   fi
-                  mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib nvidia-smi
-                  mpirun --allow-run-as-root -N 8 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib $0 \
+                  mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=${LD_LIBRARY_PATH} nvidia-smi
+                  mpirun --allow-run-as-root -N 8 -x ${PATH} -x LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -x NVIDIA_GDRCOPY=enabled -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_LOG_LEVEL=info -x NCCL_DEBUG=info -x NCCL_TUNER_PLUGIN=${NCCL_TUNER_PLUGIN} $0 \
                     -b 8 \
                     -e 16G \
                     -f 2 \
@@ -65,6 +72,7 @@ spec:
             - image: PLACEHOLDER
               imagePullPolicy: IfNotPresent
               name: PLACEHOLDER
+              env:
               volumeMounts:
                 - name: shmem
                   mountPath: /dev/shm
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index aafc32fdb..45a785b34 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -54,753 +54,758 @@ jobs:
     secrets: inherit
 
   test-nccl:
-    if: inputs.ARCHITECTURE == 'amd64' # build only amd64
-    needs: build-base
-    uses: ./.github/workflows/_test_nccl.yaml
-    with:
-      CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
-    secrets: inherit
-
-  build-jax:
-    needs: build-base
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
-    steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-        - name: Build JAX container
-          id: build-jax
-          uses: ./.github/actions/build-container
-          with:
-            ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-            ARTIFACT_NAME: artifact-jax-build
-            BADGE_FILENAME: badge-jax-build
-            BUILD_DATE: ${{ inputs.BUILD_DATE }}
-            BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
-            CONTAINER_NAME: jax
-            DOCKERFILE: .github/container/Dockerfile.jax
-            RUNNER_SIZE: large
-            ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-            ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-            github-token: ${{ secrets.GITHUB_TOKEN }}
-            bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-            EXTRA_BUILD_ARGS: |
-              URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
-              URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
-              URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
-              URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
-
-  build-equinox:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build Equinox container
-        id: build-equinox
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-equinox-build
-          BADGE_FILENAME: badge-equinox-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: equinox
-          DOCKERFILE: .github/container/Dockerfile.equinox
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
-
-  build-maxtext:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build MaxText container
-        id: build-maxtext
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-maxtext-build
-          BADGE_FILENAME: badge-maxtext-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: maxtext
-          DOCKERFILE: .github/container/Dockerfile.maxtext
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
-            URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }}
-
-  build-upstream-t5x:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - id: checkout
-        name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build Upstream T5X container
-        id: build-upstream-t5x
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-t5x-build
-          BADGE_FILENAME: badge-t5x-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: upstream-t5x
-          DOCKERFILE: .github/container/Dockerfile.t5x
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }}
-            URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }}
-
-  build-axlearn:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build AxLearn container
-        id: build-axlearn
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-axlearn-build
-          BADGE_FILENAME: badge-axlearn-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: axlearn
-          DOCKERFILE: .github/container/Dockerfile.axlearn
-          RUNNER_SIZE: large
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
-
-  build-rosetta-t5x:
-    needs: build-upstream-t5x
-    uses: ./.github/workflows/_build_rosetta.yaml
-    with:
-      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      BUILD_DATE: ${{ inputs.BUILD_DATE }}
-      BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
-      BASE_LIBRARY: t5x
-    secrets: inherit
-
-  collect-docker-tags:
-    runs-on: ubuntu-22.04
-    if: ${{ !cancelled() }}
-    needs:
-      - build-base
-      - build-jax
-      - build-equinox
-      - build-maxtext
-      - build-upstream-t5x
-      - build-axlearn
-      - build-rosetta-t5x
-    outputs:
-      TAGS: ${{ steps.collect-tags.outputs.TAGS }}
-    steps:
-      - name: Save docker tags as a JSON object
-        id: collect-tags
-        run: |
-          TAGS=$(cat <<EOF | jq -c
-          [\
-            {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
-            {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "upstream-t5x", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "t5x",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "t5x",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
-
-            {}\
-          ]
-          EOF
-          )
-
-          echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
-
-  test-distribution:
-    runs-on: ubuntu-22.04
-    strategy:
-      matrix:
-        TEST_SCRIPT:
-          - extra-only-distribution.sh
-          - mirror-only-distribution.sh
-          - upstream-only-distribution.sh
-          - local-patch-distribution.sh
-      fail-fast: false
-    steps:
-      - name: Print environment variables
-        run: env
-      - name: Set git login for tests
-        run: |
-          git config --global user.email "jax@nvidia.com"
-          git config --global user.name "JAX-Toolbox CI"
-      - name: Check out the repository under ${GITHUB_WORKSPACE}
-        uses: actions/checkout@v4
-      - name: Run integration test ${{ matrix.TEST_SCRIPT }}
-        run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
-
-  test-jax:
-    needs: build-jax
     if: >-
       inputs.ARCHITECTURE == 'amd64' &&
       (
         inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
+        inputs.MODE == 'nccl'
       )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jax
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-backend-independent.log
-          test-jax.sh -b backend-independent
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-single-gpu.log
-          nvidia-cuda-mps-control -d
-          test-jax.sh -b single-gpu
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-multi-gpu.log
-          nvidia-cuda-mps-control -d
-          test-jax.sh -b multi-gpu
-        EOF
-      STATISTICS_SCRIPT: |
-        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-backend-independent.log
-        test-multi-gpu.log
-        test-single-gpu.log
-    secrets: inherit
-
-  test-nsys-jax:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: nsys-jax
-      EXECUTE: |
-        set -o pipefail
-        num_tests=0
-        num_failures=0
-        # Run the pytest-driven tests; failure is explicitly handled below so set +e to
-        # avoid an early abort here.
-        set +e
-        docker run -i --shm-size=1g --gpus all \
-          -v $PWD:/opt/output \
-          ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-nsys-jax.log
-            # nsys-jax is already installed, this is just adding the test dependencies
-            pip install pytest-reportlog nsys-jax[test]
-            # abuse knowledge that nsys-jax is installed editable, so the tests exist
-            test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
-            pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
-        EOF
-        set -e
-        GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
-        for mode in 1-process 2-process process-per-gpu; do
-          DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
-          if [[ "${mode}" == "1-process" ]]; then
-            PROCESS_COUNT=1
-            ARGS=""
-          elif [[ "${mode}" == "2-process" ]]; then
-            # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
-            # this will flush out more bugs than process-per-node or process-per-GPU.
-            PROCESS_COUNT=2
-            ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
-          else
-            PROCESS_COUNT=${GPUS_PER_NODE}
-            ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
-          fi
-          for collection in full partial; do
-            NSYS_JAX="nsys-jax"
-            if [[ "${mode}" == "1-process" ]]; then
-              # We will not run nsys-jax-combine, so run analyses eagerly
-              NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
-            fi
-            NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
-            if [[ "${collection}" == "partial" ]]; then
-              NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
-              # nvbug/4801401
-              NSYS_JAX+=" --sample=none"
-            fi
-            set +e
-            ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
-              -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
-            num_failures=$((num_failures + ($? != 0)))
-            set -e
-            num_tests=$((num_tests + 1))
-          done
-          if [[ "${mode}" != "1-process" ]]; then
-            # Run nsys-jax-combine
-            NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
-            for (( i=0; i<PROCESS_COUNT; i++ )); do
-              NSYS_JAX_COMBINE+=" /opt/output/${mode}-${collection}-execution-${i}.zip"
-            done
-            set +e
-            ${DOCKER} ${NSYS_JAX_COMBINE} |& tee ${mode}-${collection}-execution-combine.log
-            num_failures=$((num_failures + ($? != 0)))
-            set -e
-            num_tests=$((num_tests + 1))
-          fi
-        done
-        ls -R .
-        echo "NSYS_JAX_TEST_COUNT=${num_tests}" >> $GITHUB_ENV
-        echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
-        exit $num_failures
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-nsys-jax.log)
-        num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-        total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
-        num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
-        num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        # pytest-driven part
-        test-nsys-jax.log
-        pytest-report.jsonl
-        # nsys-jax logfiles
-        *process-*-execution.log
-        # nsys-jax output for the case that doesn't use nsys-jax-combine
-        1-process-*-execution-0.zip
-        # nsys-jax-combine output/logfiles
-        *process*-*-execution.zip
-        *-execution-combine.log
-    secrets: inherit
-
-  # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
-  # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
-  # not already have nsys-jax installed
-  test-nsys-jax-archive:
-    needs: test-nsys-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Download nsys-jax output .zip files
-      uses: actions/download-artifact@v4
-      with:
-        name: nsys-jax-unit-test-A100
-    - name: Extract archives and execute install scripts
-      run: |
-        pip install virtualenv # for install.sh
-        for zip in $(ls *.zip); do
-          ZIP="${PWD}/${zip}"
-          pushd $(mktemp -d)
-          unzip "${ZIP}"
-          ls -l
-          # TODO: verify this isn't needed, or make sure it isn't needed
-          chmod 755 install.sh
-          # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
-          # Skip executing Jupyter lab
-          NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
-          popd
-        done
-
-  test-nsys-jax-eks:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    runs-on: eks
-    env:
-      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: ${{ github.run_id }}-nsys-jax
-      # Service name cannot start with a number
-      SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax
-      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure Kubernetes job
-      run: |
-        yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
-          | select(di == 0).metadata.name = strenv(SERVICE_NAME)
-          | select(di == 1).metadata.name = strenv(JOB_NAME)
-          | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
-          | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-          | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
-          | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
-          | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
-          .github/eks-workflow-files/job.yml
-        git diff .github/eks-workflow-files/job.yml
-    - name: Submit Kubernetes job
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: .github/eks-workflow-files/job.yml
-        job-name: ${{ env.JOB_NAME }}
-    - name: Configure post-processing job
-      run: |
-        export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
-        yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
-          | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
-          | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-          | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
-          .github/eks-workflow-files/post-process-job.yml
-        git diff .github/eks-workflow-files/post-process-job.yml
-    - name: Submit post process Kubernetes job
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: .github/eks-workflow-files/post-process-job.yml
-        job-name: ${{ env.POSTPROCESS_JOB_NAME }}
-
-  test-te-h100:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'te'
-      )
-    uses: ./.github/workflows/_transformer_engine_eks.yaml
-    with:
-      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: transformerengine-${{ github.run_id }}
-      S3_BUCKET: jax-toolbox-eks-output
-      CI_NAME: transformer-engine
-    secrets: inherit
-
-  test-jax-cutlass-h100:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax-cutlass'
-      )
-    uses: ./.github/workflows/_jax_cutlass_eks.yaml
-    with:
-      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: jax-cutlass-${{ github.run_id }}
-      S3_BUCKET: jax-toolbox-eks-output
-      CI_NAME: jax-cutlass
-    secrets: inherit
-
-  test-te-a100:
-    needs: build-jax
-    secrets: inherit
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'te'
-      )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: te
-      EXECUTE: |
-        docker run -i --gpus all --shm-size=1g -v $PWD:/log \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-te.log
-          set -xu -o pipefail
-
-          LOG_DIR=/log
-
-          pip install pytest-reportlog pytest-xdist
-          # Start MPS daemon
-          nvidia-cuda-mps-control -d
-          # TE's default is slightly different, without the hyphen
-          export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
-          # 1 GPU per worker, 3 workers per GPU
-          pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
-          ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
-          ## into a single .jsonl file of results from multiple pytest invocations
-          ## inside the test.sh script, so it's useful even with a single worker per
-          ## device.
-          pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
-
-          # merge the log files
-          cat \
-            ${LOG_DIR}/pytest-report-L0-unittest.jsonl \
-            ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
-            > ${LOG_DIR}/pytest-report.jsonl
-
-        EOF
-      STATISTICS_SCRIPT: |
-        report_json=pytest-report.jsonl
-        summary_line=$(tail -n1 test-te.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-
-        echo "$failed_tests tests failed"
-        if [[ $failed_tests -gt 0 ]]; then
-            exit 1
-        else
-            exit 0
-        fi
-
-      TIMEOUT_MINUTES: 120
-      ARTIFACTS: |
-        test-te.log
-        pytest-report.jsonl
-        pytest-report-L0-unittest.jsonl
-        pytest-report-L0-distributed-unittest.jsonl
-
-  test-rosetta-t5x:
-    needs: build-rosetta-t5x
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 't5x'
-      )
-    uses: ./.github/workflows/_test_t5x_rosetta.yaml
-    with:
-      T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
-
-  test-maxtext:
-    needs: build-maxtext
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'maxtext'
-      )
-    uses: ./.github/workflows/_test_maxtext.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
-
-  test-maxtext-gke:
-    needs: build-maxtext
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'maxtext'
-      )
-    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    needs: build-base
+    uses: ./.github/workflows/_test_nccl.yaml
     with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+      CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
     secrets: inherit
 
-  test-axlearn-eks:
-    needs: build-axlearn
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'axlearn'
-      )
-    runs-on: eks
-    env:
-      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: axlearn-${{ github.run_id }}
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure axlearn test job
-      run: |
-        # Replace placeholders in axlearn-job.yml with environment variables
-        yq -i ea '
-           select(di == 0).metadata.name = strenv(JOB_NAME)
-          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-          | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
-          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-        .github/eks-workflow-files/axlearn/axlearn-job.yml
-        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
-    - name: Submit & delete axlearn test
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
-        job-name: ${{ env.JOB_NAME }}
-    - name: Download logs from S3
-      id: log-s3
-      if: ${{ !cancelled() }}
-      run: |
-        mkdir -p axlearn-output
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
-
-
-        passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        total_tests=$((failed_tests + passed_tests + skipped_tests))
-
-        echo "Passed tests: $passed_tests"
-        echo "Failed tests: $failed_tests"
-        echo "Skipped tests: $skipped_tests"
-        echo "Total tests: $total_tests"
-        echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
-        echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
-    - name: Generate sitrep
-      id: sitrep
-      if: ${{ !cancelled() }}
-      shell: bash -x -e {0}
-      run: |
-        # bring in utility functions
-        source .github/workflows/scripts/to_json.sh
-
-        badge_label='Axlearn EKS Unit'
-
-        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
-        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
-        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
-        errors="0" \
-        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
-        badge_message="Passed $passed_tests out of $total_tests." \
-        badge_color="brightgreen"
-        if [ "$failed_tests" -gt 0 ]; then
-          badge_color="red"
-        fi \
-
-        to_json \
-          summary \
-          errors total_tests passed_tests failed_tests \
-          badge_label badge_color badge_message \
-        > sitrep.json
-
-        schemaVersion=1 \
-        label="${badge_label}" \
-        message="Passed $passed_tests out of $total_tests." \
-        color=$badge_color \
-        to_json schemaVersion label message color \
-        > badge-axlearn-test.json
-
-    - name: Upload artifacts
-      if: ${{ !cancelled() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: "artifact-axlearn-test"
-        path: |
-          sitrep.json
-          badge-axlearn-test.json
-          axlearn-unittests.jsonl
-          axlearn-output/*
-
-  test-axlearn-fuji-models-eks:
-    needs: build-axlearn
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'axlearn'
-      )
-    runs-on: eks
-    env:
-      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure axlearn test job
-      run: |
-        yq -i ea '
-           select(di == 0).metadata.name = strenv(JOB_NAME)
-          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-        .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-
-    - name: Submit & delete axlearn fuji model test
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
-        job-name: ${{ env.JOB_NAME }}
+  # build-jax:
+  #   needs: build-base
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
+  #   steps:
+  #       - name: Checkout repository
+  #         uses: actions/checkout@v4
+  #       - name: Build JAX container
+  #         id: build-jax
+  #         uses: ./.github/actions/build-container
+  #         with:
+  #           ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #           ARTIFACT_NAME: artifact-jax-build
+  #           BADGE_FILENAME: badge-jax-build
+  #           BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #           BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
+  #           CONTAINER_NAME: jax
+  #           DOCKERFILE: .github/container/Dockerfile.jax
+  #           RUNNER_SIZE: large
+  #           ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #           ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #           github-token: ${{ secrets.GITHUB_TOKEN }}
+  #           bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #           EXTRA_BUILD_ARGS: |
+  #             URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
+  #             URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
+  #             URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
+  #             URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
+
+  # build-equinox:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build Equinox container
+  #       id: build-equinox
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-equinox-build
+  #         BADGE_FILENAME: badge-equinox-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: equinox
+  #         DOCKERFILE: .github/container/Dockerfile.equinox
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
+
+  # build-maxtext:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build MaxText container
+  #       id: build-maxtext
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-maxtext-build
+  #         BADGE_FILENAME: badge-maxtext-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: maxtext
+  #         DOCKERFILE: .github/container/Dockerfile.maxtext
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
+  #           URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }}
+
+  # build-upstream-t5x:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - id: checkout
+  #       name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build Upstream T5X container
+  #       id: build-upstream-t5x
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-t5x-build
+  #         BADGE_FILENAME: badge-t5x-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: upstream-t5x
+  #         DOCKERFILE: .github/container/Dockerfile.t5x
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }}
+  #           URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }}
+
+  # build-axlearn:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build AxLearn container
+  #       id: build-axlearn
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-axlearn-build
+  #         BADGE_FILENAME: badge-axlearn-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: axlearn
+  #         DOCKERFILE: .github/container/Dockerfile.axlearn
+  #         RUNNER_SIZE: large
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
+
+  # build-rosetta-t5x:
+  #   needs: build-upstream-t5x
+  #   uses: ./.github/workflows/_build_rosetta.yaml
+  #   with:
+  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #     BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
+  #     BASE_LIBRARY: t5x
+  #   secrets: inherit
+
+  # collect-docker-tags:
+  #   runs-on: ubuntu-22.04
+  #   if: ${{ !cancelled() }}
+  #   needs:
+  #     - build-base
+  #     - build-jax
+  #     - build-equinox
+  #     - build-maxtext
+  #     - build-upstream-t5x
+  #     - build-axlearn
+  #     - build-rosetta-t5x
+  #   outputs:
+  #     TAGS: ${{ steps.collect-tags.outputs.TAGS }}
+  #   steps:
+  #     - name: Save docker tags as a JSON object
+  #       id: collect-tags
+  #       run: |
+  #         TAGS=$(cat <<EOF | jq -c
+  #         [\
+  #           {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
+  #           {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "upstream-t5x", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "t5x",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "t5x",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
+
+  #           {}\
+  #         ]
+  #         EOF
+  #         )
+
+  #         echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
+
+  # test-distribution:
+  #   runs-on: ubuntu-22.04
+  #   strategy:
+  #     matrix:
+  #       TEST_SCRIPT:
+  #         - extra-only-distribution.sh
+  #         - mirror-only-distribution.sh
+  #         - upstream-only-distribution.sh
+  #         - local-patch-distribution.sh
+  #     fail-fast: false
+  #   steps:
+  #     - name: Print environment variables
+  #       run: env
+  #     - name: Set git login for tests
+  #       run: |
+  #         git config --global user.email "jax@nvidia.com"
+  #         git config --global user.name "JAX-Toolbox CI"
+  #     - name: Check out the repository under ${GITHUB_WORKSPACE}
+  #       uses: actions/checkout@v4
+  #     - name: Run integration test ${{ matrix.TEST_SCRIPT }}
+  #       run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
+
+  # test-jax:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: jax
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-backend-independent.log
+  #         test-jax.sh -b backend-independent
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-single-gpu.log
+  #         nvidia-cuda-mps-control -d
+  #         test-jax.sh -b single-gpu
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-multi-gpu.log
+  #         nvidia-cuda-mps-control -d
+  #         test-jax.sh -b multi-gpu
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-backend-independent.log
+  #       test-multi-gpu.log
+  #       test-single-gpu.log
+  #   secrets: inherit
+
+  # test-nsys-jax:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: nsys-jax
+  #     EXECUTE: |
+  #       set -o pipefail
+  #       num_tests=0
+  #       num_failures=0
+  #       # Run the pytest-driven tests; failure is explicitly handled below so set +e to
+  #       # avoid an early abort here.
+  #       set +e
+  #       docker run -i --shm-size=1g --gpus all \
+  #         -v $PWD:/opt/output \
+  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-nsys-jax.log
+  #           # nsys-jax is already installed, this is just adding the test dependencies
+  #           pip install pytest-reportlog nsys-jax[test]
+  #           # abuse knowledge that nsys-jax is installed editable, so the tests exist
+  #           test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
+  #           pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
+  #       EOF
+  #       set -e
+  #       GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
+  #       for mode in 1-process 2-process process-per-gpu; do
+  #         DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
+  #         if [[ "${mode}" == "1-process" ]]; then
+  #           PROCESS_COUNT=1
+  #           ARGS=""
+  #         elif [[ "${mode}" == "2-process" ]]; then
+  #           # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
+  #           # this will flush out more bugs than process-per-node or process-per-GPU.
+  #           PROCESS_COUNT=2
+  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
+  #         else
+  #           PROCESS_COUNT=${GPUS_PER_NODE}
+  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
+  #         fi
+  #         for collection in full partial; do
+  #           NSYS_JAX="nsys-jax"
+  #           if [[ "${mode}" == "1-process" ]]; then
+  #             # We will not run nsys-jax-combine, so run analyses eagerly
+  #             NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
+  #           fi
+  #           NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
+  #           if [[ "${collection}" == "partial" ]]; then
+  #             NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
+  #             # nvbug/4801401
+  #             NSYS_JAX+=" --sample=none"
+  #           fi
+  #           set +e
+  #           ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
+  #             -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
+  #           num_failures=$((num_failures + ($? != 0)))
+  #           set -e
+  #           num_tests=$((num_tests + 1))
+  #         done
+  #         if [[ "${mode}" != "1-process" ]]; then
+  #           # Run nsys-jax-combine
+  #           NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
+  #           for (( i=0; i<PROCESS_COUNT; i++ )); do
+  #             NSYS_JAX_COMBINE+=" /opt/output/${mode}-${collection}-execution-${i}.zip"
+  #           done
+  #           set +e
+  #           ${DOCKER} ${NSYS_JAX_COMBINE} |& tee ${mode}-${collection}-execution-combine.log
+  #           num_failures=$((num_failures + ($? != 0)))
+  #           set -e
+  #           num_tests=$((num_tests + 1))
+  #         fi
+  #       done
+  #       ls -R .
+  #       echo "NSYS_JAX_TEST_COUNT=${num_tests}" >> $GITHUB_ENV
+  #       echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
+  #       exit $num_failures
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-nsys-jax.log)
+  #       num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
+  #       num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
+  #       num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       # pytest-driven part
+  #       test-nsys-jax.log
+  #       pytest-report.jsonl
+  #       # nsys-jax logfiles
+  #       *process-*-execution.log
+  #       # nsys-jax output for the case that doesn't use nsys-jax-combine
+  #       1-process-*-execution-0.zip
+  #       # nsys-jax-combine output/logfiles
+  #       *process*-*-execution.zip
+  #       *-execution-combine.log
+  #   secrets: inherit
+
+  # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
+  # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
+  # # not already have nsys-jax installed
+  # test-nsys-jax-archive:
+  #   needs: test-nsys-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   strategy:
+  #     matrix:
+  #       os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
+  #   runs-on: ${{ matrix.os }}
+  #   steps:
+  #   - name: Download nsys-jax output .zip files
+  #     uses: actions/download-artifact@v4
+  #     with:
+  #       name: nsys-jax-unit-test-A100
+  #   - name: Extract archives and execute install scripts
+  #     run: |
+  #       pip install virtualenv # for install.sh
+  #       for zip in $(ls *.zip); do
+  #         ZIP="${PWD}/${zip}"
+  #         pushd $(mktemp -d)
+  #         unzip "${ZIP}"
+  #         ls -l
+  #         # TODO: verify this isn't needed, or make sure it isn't needed
+  #         chmod 755 install.sh
+  #         # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
+  #         # Skip executing Jupyter lab
+  #         NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
+  #         popd
+  #       done
+
+  # test-nsys-jax-eks:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: ${{ github.run_id }}-nsys-jax
+  #     # Service name cannot start with a number
+  #     SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax
+  #     POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure Kubernetes job
+  #     run: |
+  #       yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+  #         | select(di == 0).metadata.name = strenv(SERVICE_NAME)
+  #         | select(di == 1).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+  #         | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #         | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+  #         | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
+  #         | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
+  #         .github/eks-workflow-files/job.yml
+  #       git diff .github/eks-workflow-files/job.yml
+  #   - name: Submit Kubernetes job
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: .github/eks-workflow-files/job.yml
+  #       job-name: ${{ env.JOB_NAME }}
+  #   - name: Configure post-processing job
+  #     run: |
+  #       export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
+  #       yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
+  #         | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
+  #         | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #         | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
+  #         .github/eks-workflow-files/post-process-job.yml
+  #       git diff .github/eks-workflow-files/post-process-job.yml
+  #   - name: Submit post process Kubernetes job
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: .github/eks-workflow-files/post-process-job.yml
+  #       job-name: ${{ env.POSTPROCESS_JOB_NAME }}
+
+  # test-te-h100:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'te'
+  #     )
+  #   uses: ./.github/workflows/_transformer_engine_eks.yaml
+  #   with:
+  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: transformerengine-${{ github.run_id }}
+  #     S3_BUCKET: jax-toolbox-eks-output
+  #     CI_NAME: transformer-engine
+  #   secrets: inherit
+
+  # test-jax-cutlass-h100:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax-cutlass'
+  #     )
+  #   uses: ./.github/workflows/_jax_cutlass_eks.yaml
+  #   with:
+  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: jax-cutlass-${{ github.run_id }}
+  #     S3_BUCKET: jax-toolbox-eks-output
+  #     CI_NAME: jax-cutlass
+  #   secrets: inherit
+
+  # test-te-a100:
+  #   needs: build-jax
+  #   secrets: inherit
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'te'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: te
+  #     EXECUTE: |
+  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-te.log
+  #         set -xu -o pipefail
+
+  #         LOG_DIR=/log
+
+  #         pip install pytest-reportlog pytest-xdist
+  #         # Start MPS daemon
+  #         nvidia-cuda-mps-control -d
+  #         # TE's default is slightly different, without the hyphen
+  #         export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
+  #         # 1 GPU per worker, 3 workers per GPU
+  #         pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
+  #         ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
+  #         ## into a single .jsonl file of results from multiple pytest invocations
+  #         ## inside the test.sh script, so it's useful even with a single worker per
+  #         ## device.
+  #         pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
+
+  #         # merge the log files
+  #         cat \
+  #           ${LOG_DIR}/pytest-report-L0-unittest.jsonl \
+  #           ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
+  #           > ${LOG_DIR}/pytest-report.jsonl
+
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       report_json=pytest-report.jsonl
+  #       summary_line=$(tail -n1 test-te.log)
+  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+
+  #       echo "$failed_tests tests failed"
+  #       if [[ $failed_tests -gt 0 ]]; then
+  #           exit 1
+  #       else
+  #           exit 0
+  #       fi
+
+  #     TIMEOUT_MINUTES: 120
+  #     ARTIFACTS: |
+  #       test-te.log
+  #       pytest-report.jsonl
+  #       pytest-report-L0-unittest.jsonl
+  #       pytest-report-L0-distributed-unittest.jsonl
+
+  # test-rosetta-t5x:
+  #   needs: build-rosetta-t5x
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 't5x'
+  #     )
+  #   uses: ./.github/workflows/_test_t5x_rosetta.yaml
+  #   with:
+  #     T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
+
+  # test-maxtext:
+  #   needs: build-maxtext
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'maxtext'
+  #     )
+  #   uses: ./.github/workflows/_test_maxtext.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
+
+  # test-maxtext-gke:
+  #   needs: build-maxtext
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'maxtext'
+  #     )
+  #   uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
+
+  # test-axlearn-eks:
+  #   needs: build-axlearn
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'axlearn'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: axlearn-${{ github.run_id }}
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure axlearn test job
+  #     run: |
+  #       # Replace placeholders in axlearn-job.yml with environment variables
+  #       yq -i ea '
+  #          select(di == 0).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+  #         | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+  #       .github/eks-workflow-files/axlearn/axlearn-job.yml
+  #       git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
+  #   - name: Submit & delete axlearn test
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
+  #       job-name: ${{ env.JOB_NAME }}
+  #   - name: Download logs from S3
+  #     id: log-s3
+  #     if: ${{ !cancelled() }}
+  #     run: |
+  #       mkdir -p axlearn-output
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
+
+
+  #       passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       total_tests=$((failed_tests + passed_tests + skipped_tests))
+
+  #       echo "Passed tests: $passed_tests"
+  #       echo "Failed tests: $failed_tests"
+  #       echo "Skipped tests: $skipped_tests"
+  #       echo "Total tests: $total_tests"
+  #       echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
+  #       echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
+  #   - name: Generate sitrep
+  #     id: sitrep
+  #     if: ${{ !cancelled() }}
+  #     shell: bash -x -e {0}
+  #     run: |
+  #       # bring in utility functions
+  #       source .github/workflows/scripts/to_json.sh
+
+  #       badge_label='Axlearn EKS Unit'
+
+  #       total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
+  #       failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
+  #       passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
+  #       errors="0" \
+  #       summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+  #       badge_message="Passed $passed_tests out of $total_tests." \
+  #       badge_color="brightgreen"
+  #       if [ "$failed_tests" -gt 0 ]; then
+  #         badge_color="red"
+  #       fi \
+
+  #       to_json \
+  #         summary \
+  #         errors total_tests passed_tests failed_tests \
+  #         badge_label badge_color badge_message \
+  #       > sitrep.json
+
+  #       schemaVersion=1 \
+  #       label="${badge_label}" \
+  #       message="Passed $passed_tests out of $total_tests." \
+  #       color=$badge_color \
+  #       to_json schemaVersion label message color \
+  #       > badge-axlearn-test.json
+
+  #   - name: Upload artifacts
+  #     if: ${{ !cancelled() }}
+  #     uses: actions/upload-artifact@v4
+  #     with:
+  #       name: "artifact-axlearn-test"
+  #       path: |
+  #         sitrep.json
+  #         badge-axlearn-test.json
+  #         axlearn-unittests.jsonl
+  #         axlearn-output/*
+
+  # test-axlearn-fuji-models-eks:
+  #   needs: build-axlearn
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'axlearn'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure axlearn test job
+  #     run: |
+  #       yq -i ea '
+  #          select(di == 0).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+  #       .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+  #       git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+
+  #   - name: Submit & delete axlearn fuji model test
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
+  #       job-name: ${{ env.JOB_NAME }}
diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index 8dbe3ac7b..3fc671451 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -14,11 +14,11 @@ permissions:
   packages: write # to upload container
 
 jobs:
-  nccl-test-gke:
-    uses: ./.github/workflows/_test_nccl_gke.yaml
-    with:
-      JAX_IMAGE: ${{ inputs.CONTAINER }}
-    secrets: inherit
+  # nccl-test-gke:
+  #   uses: ./.github/workflows/_test_nccl_gke.yaml
+  #   with:
+  #     JAX_IMAGE: ${{ inputs.CONTAINER }}
+  #   secrets: inherit
 
   build-mpi-operator-compatible-base:
     runs-on: [self-hosted, "amd64", "large"]
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 1a6f53ec4..2a84fe93d 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -51,7 +51,8 @@ on:
           - t5x - run build rosetta
           - maxtext - run only the tests for maxtext
           - axlearn - run only the tests for axlearn
-        options: [full, jax, te, t5x, maxtext, axlearn]
+          - nccl - run only nccl tests
+        options: [full, jax, te, t5x, maxtext, axlearn, nccl]
         default: full
 
 concurrency:
@@ -210,7 +211,7 @@ jobs:
       CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
       MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
       SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
-      MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
+      MODE: "nccl" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
     secrets: inherit
 
   arm64:
@@ -222,7 +223,7 @@ jobs:
       CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
       MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
       SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
-      MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
+      MODE: "nccl" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
     secrets: inherit
 
   # Only merge if everything succeeds