diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml index 0e34cb7a2..003605148 100644 --- a/.github/eks-workflow-files/mpi-nccl-test.yml +++ b/.github/eks-workflow-files/mpi-nccl-test.yml @@ -26,6 +26,13 @@ spec: - image: PLACEHOLDER imagePullPolicy: IfNotPresent name: PLACEHOLDER + env: + - name: PATH + value: $PATH:/opt/amazon/efa/bin:/usr/bin:/usr/local/mpi/bin + - name: LD_LIBRARY_PATH + value: "/usr/local/mpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH" + - name: NCCL_TUNER_PLUGIN + value: /opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so command: - bash - -c @@ -41,8 +48,8 @@ spec: echo "Workers were still not reachable after ${limit}, exiting" exit 1 fi - mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib nvidia-smi - mpirun --allow-run-as-root -N 8 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib $0 \ + mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=${LD_LIBRARY_PATH} nvidia-smi + mpirun --allow-run-as-root -N 8 -x ${PATH} -x LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -x NVIDIA_GDRCOPY=enabled -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_LOG_LEVEL=info -x NCCL_DEBUG=info -x NCCL_TUNER_PLUGIN=${NCCL_TUNER_PLUGIN} $0 \ -b 8 \ -e 16G \ -f 2 \ @@ -65,6 +72,7 @@ spec: - image: PLACEHOLDER imagePullPolicy: IfNotPresent name: PLACEHOLDER + env: volumeMounts: - name: shmem mountPath: /dev/shm diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index aafc32fdb..45a785b34 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -54,753 +54,758 @@ jobs: secrets: inherit test-nccl: - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - needs: build-base - uses: ./.github/workflows/_test_nccl.yaml - with: - CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} - secrets: inherit - - build-jax: - needs: build-base - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build JAX container - id: build-jax - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-jax-build - BADGE_FILENAME: badge-jax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - CONTAINER_NAME: jax - DOCKERFILE: .github/container/Dockerfile.jax - RUNNER_SIZE: large - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }} - - build-equinox: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build Equinox container - id: build-equinox - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - - build-maxtext: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build MaxText container - id: build-maxtext - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }} - - build-upstream-t5x: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - steps: - - id: checkout - name: Checkout repository - uses: actions/checkout@v4 - - name: Build Upstream T5X container - id: build-upstream-t5x - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-t5x-build - BADGE_FILENAME: badge-t5x-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - - build-axlearn: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build AxLearn container - id: build-axlearn - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-axlearn-build - BADGE_FILENAME: badge-axlearn-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: axlearn - DOCKERFILE: .github/container/Dockerfile.axlearn - RUNNER_SIZE: large - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} - - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit - - collect-docker-tags: - runs-on: ubuntu-22.04 - if: ${{ !cancelled() }} - needs: - - build-base - - build-jax - - build-equinox - - build-maxtext - - build-upstream-t5x - - build-axlearn - - build-rosetta-t5x - outputs: - TAGS: ${{ steps.collect-tags.outputs.TAGS }} - steps: - - name: Save docker tags as a JSON object - id: collect-tags - run: | - TAGS=$(cat <> $GITHUB_OUTPUT - - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax if: >- inputs.ARCHITECTURE == 'amd64' && ( inputs.MODE == 'full' || - inputs.MODE == 'jax' + inputs.MODE == 'nccl' ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-single-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b single-gpu - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-multi-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b multi-gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-multi-gpu.log - test-single-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. - PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? != 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit - - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - # Service name cannot start with a number - SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 0).metadata.name = strenv(SERVICE_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - test-te-h100: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'te' - ) - uses: ./.github/workflows/_transformer_engine_eks.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: transformerengine-${{ github.run_id }} - S3_BUCKET: jax-toolbox-eks-output - CI_NAME: transformer-engine - secrets: inherit - - test-jax-cutlass-h100: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax-cutlass' - ) - uses: ./.github/workflows/_jax_cutlass_eks.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: jax-cutlass-${{ github.run_id }} - S3_BUCKET: jax-toolbox-eks-output - CI_NAME: jax-cutlass - secrets: inherit - - test-te-a100: - needs: build-jax - secrets: inherit - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'te' - ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - set -xu -o pipefail - - LOG_DIR=/log - - pip install pytest-reportlog pytest-xdist - # Start MPS daemon - nvidia-cuda-mps-control -d - # TE's default is slightly different, without the hyphen - export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} - # 1 GPU per worker, 3 workers per GPU - pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh - ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation - ## into a single .jsonl file of results from multiple pytest invocations - ## inside the test.sh script, so it's useful even with a single worker per - ## device. - pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh - - # merge the log files - cat \ - ${LOG_DIR}/pytest-report-L0-unittest.jsonl \ - ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \ - > ${LOG_DIR}/pytest-report.jsonl - - EOF - STATISTICS_SCRIPT: | - report_json=pytest-report.jsonl - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - - echo "$failed_tests tests failed" - if [[ $failed_tests -gt 0 ]]; then - exit 1 - else - exit 0 - fi - - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - pytest-report-L0-unittest.jsonl - pytest-report-L0-distributed-unittest.jsonl - - test-rosetta-t5x: - needs: build-rosetta-t5x - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 't5x' - ) - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-maxtext: - needs: build-maxtext - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'maxtext' - ) - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-maxtext-gke: - needs: build-maxtext - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'maxtext' - ) - uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + needs: build-base + uses: ./.github/workflows/_test_nccl.yaml with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} secrets: inherit - test-axlearn-eks: - needs: build-axlearn - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'axlearn' - ) - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-${{ github.run_id }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure axlearn test job - run: | - # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea ' - select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" - | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - .github/eks-workflow-files/axlearn/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Submit & delete axlearn test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" - job-name: ${{ env.JOB_NAME }} - - name: Download logs from S3 - id: log-s3 - if: ${{ !cancelled() }} - run: | - mkdir -p axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" - - - passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - total_tests=$((failed_tests + passed_tests + skipped_tests)) - - echo "Passed tests: $passed_tests" - echo "Failed tests: $failed_tests" - echo "Skipped tests: $skipped_tests" - echo "Total tests: $total_tests" - echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT - echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - - name: Generate sitrep - id: sitrep - if: ${{ !cancelled() }} - shell: bash -x -e {0} - run: | - # bring in utility functions - source .github/workflows/scripts/to_json.sh - - badge_label='Axlearn EKS Unit' - - total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ - failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ - passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ - errors="0" \ - summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ - badge_message="Passed $passed_tests out of $total_tests." \ - badge_color="brightgreen" - if [ "$failed_tests" -gt 0 ]; then - badge_color="red" - fi \ - - to_json \ - summary \ - errors total_tests passed_tests failed_tests \ - badge_label badge_color badge_message \ - > sitrep.json - - schemaVersion=1 \ - label="${badge_label}" \ - message="Passed $passed_tests out of $total_tests." \ - color=$badge_color \ - to_json schemaVersion label message color \ - > badge-axlearn-test.json - - - name: Upload artifacts - if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: "artifact-axlearn-test" - path: | - sitrep.json - badge-axlearn-test.json - axlearn-unittests.jsonl - axlearn-output/* - - test-axlearn-fuji-models-eks: - needs: build-axlearn - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'axlearn' - ) - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure axlearn test job - run: | - yq -i ea ' - select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - - - name: Submit & delete axlearn fuji model test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }} + # build-jax: + # needs: build-base + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build JAX container + # id: build-jax + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-jax-build + # BADGE_FILENAME: badge-jax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # CONTAINER_NAME: jax + # DOCKERFILE: .github/container/Dockerfile.jax + # RUNNER_SIZE: large + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + # URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + # URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + # URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }} + + # build-equinox: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build Equinox container + # id: build-equinox + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + + # build-maxtext: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build MaxText container + # id: build-maxtext + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }} + + # build-upstream-t5x: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # steps: + # - id: checkout + # name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build Upstream T5X container + # id: build-upstream-t5x + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-t5x-build + # BADGE_FILENAME: badge-t5x-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + + # build-axlearn: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build AxLearn container + # id: build-axlearn + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-axlearn-build + # BADGE_FILENAME: badge-axlearn-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: axlearn + # DOCKERFILE: .github/container/Dockerfile.axlearn + # RUNNER_SIZE: large + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} + + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit + + # collect-docker-tags: + # runs-on: ubuntu-22.04 + # if: ${{ !cancelled() }} + # needs: + # - build-base + # - build-jax + # - build-equinox + # - build-maxtext + # - build-upstream-t5x + # - build-axlearn + # - build-rosetta-t5x + # outputs: + # TAGS: ${{ steps.collect-tags.outputs.TAGS }} + # steps: + # - name: Save docker tags as a JSON object + # id: collect-tags + # run: | + # TAGS=$(cat <> $GITHUB_OUTPUT + + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-single-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b single-gpu + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-multi-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b multi-gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-multi-gpu.log + # test-single-gpu.log + # secrets: inherit + + # test-nsys-jax: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. + # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. + # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? != 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log + # secrets: inherit + + # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + # # not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # # Service name cannot start with a number + # SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 0).metadata.name = strenv(SERVICE_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + + # test-te-h100: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'te' + # ) + # uses: ./.github/workflows/_transformer_engine_eks.yaml + # with: + # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: transformerengine-${{ github.run_id }} + # S3_BUCKET: jax-toolbox-eks-output + # CI_NAME: transformer-engine + # secrets: inherit + + # test-jax-cutlass-h100: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax-cutlass' + # ) + # uses: ./.github/workflows/_jax_cutlass_eks.yaml + # with: + # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: jax-cutlass-${{ github.run_id }} + # S3_BUCKET: jax-toolbox-eks-output + # CI_NAME: jax-cutlass + # secrets: inherit + + # test-te-a100: + # needs: build-jax + # secrets: inherit + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'te' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # set -xu -o pipefail + + # LOG_DIR=/log + + # pip install pytest-reportlog pytest-xdist + # # Start MPS daemon + # nvidia-cuda-mps-control -d + # # TE's default is slightly different, without the hyphen + # export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} + # # 1 GPU per worker, 3 workers per GPU + # pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh + # ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation + # ## into a single .jsonl file of results from multiple pytest invocations + # ## inside the test.sh script, so it's useful even with a single worker per + # ## device. + # pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh + + # # merge the log files + # cat \ + # ${LOG_DIR}/pytest-report-L0-unittest.jsonl \ + # ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \ + # > ${LOG_DIR}/pytest-report.jsonl + + # EOF + # STATISTICS_SCRIPT: | + # report_json=pytest-report.jsonl + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + + # echo "$failed_tests tests failed" + # if [[ $failed_tests -gt 0 ]]; then + # exit 1 + # else + # exit 0 + # fi + + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # pytest-report-L0-unittest.jsonl + # pytest-report-L0-distributed-unittest.jsonl + + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 't5x' + # ) + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-maxtext: + # needs: build-maxtext + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'maxtext' + # ) + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-maxtext-gke: + # needs: build-maxtext + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'maxtext' + # ) + # uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-axlearn-eks: + # needs: build-axlearn + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'axlearn' + # ) + # runs-on: eks + # env: + # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: axlearn-${{ github.run_id }} + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure axlearn test job + # run: | + # # Replace placeholders in axlearn-job.yml with environment variables + # yq -i ea ' + # select(di == 0).metadata.name = strenv(JOB_NAME) + # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + # | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" + # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + # .github/eks-workflow-files/axlearn/axlearn-job.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + # - name: Submit & delete axlearn test + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + # job-name: ${{ env.JOB_NAME }} + # - name: Download logs from S3 + # id: log-s3 + # if: ${{ !cancelled() }} + # run: | + # mkdir -p axlearn-output + # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ + # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" + # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" + + + # passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # total_tests=$((failed_tests + passed_tests + skipped_tests)) + + # echo "Passed tests: $passed_tests" + # echo "Failed tests: $failed_tests" + # echo "Skipped tests: $skipped_tests" + # echo "Total tests: $total_tests" + # echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + # echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + # - name: Generate sitrep + # id: sitrep + # if: ${{ !cancelled() }} + # shell: bash -x -e {0} + # run: | + # # bring in utility functions + # source .github/workflows/scripts/to_json.sh + + # badge_label='Axlearn EKS Unit' + + # total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + # failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + # passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ + # errors="0" \ + # summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + # badge_message="Passed $passed_tests out of $total_tests." \ + # badge_color="brightgreen" + # if [ "$failed_tests" -gt 0 ]; then + # badge_color="red" + # fi \ + + # to_json \ + # summary \ + # errors total_tests passed_tests failed_tests \ + # badge_label badge_color badge_message \ + # > sitrep.json + + # schemaVersion=1 \ + # label="${badge_label}" \ + # message="Passed $passed_tests out of $total_tests." \ + # color=$badge_color \ + # to_json schemaVersion label message color \ + # > badge-axlearn-test.json + + # - name: Upload artifacts + # if: ${{ !cancelled() }} + # uses: actions/upload-artifact@v4 + # with: + # name: "artifact-axlearn-test" + # path: | + # sitrep.json + # badge-axlearn-test.json + # axlearn-unittests.jsonl + # axlearn-output/* + + # test-axlearn-fuji-models-eks: + # needs: build-axlearn + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'axlearn' + # ) + # runs-on: eks + # env: + # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure axlearn test job + # run: | + # yq -i ea ' + # select(di == 0).metadata.name = strenv(JOB_NAME) + # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + # .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + + # - name: Submit & delete axlearn fuji model test + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" + # job-name: ${{ env.JOB_NAME }} diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 8dbe3ac7b..3fc671451 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -14,11 +14,11 @@ permissions: packages: write # to upload container jobs: - nccl-test-gke: - uses: ./.github/workflows/_test_nccl_gke.yaml - with: - JAX_IMAGE: ${{ inputs.CONTAINER }} - secrets: inherit + # nccl-test-gke: + # uses: ./.github/workflows/_test_nccl_gke.yaml + # with: + # JAX_IMAGE: ${{ inputs.CONTAINER }} + # secrets: inherit build-mpi-operator-compatible-base: runs-on: [self-hosted, "amd64", "large"] diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..2a84fe93d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -51,7 +51,8 @@ on: - t5x - run build rosetta - maxtext - run only the tests for maxtext - axlearn - run only the tests for axlearn - options: [full, jax, te, t5x, maxtext, axlearn] + - nccl - run only nccl tests + options: [full, jax, te, t5x, maxtext, axlearn, nccl] default: full concurrency: @@ -210,7 +211,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +223,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds