From 2501d5282dc3ccd2379f43b75cc5b3289294b88f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 25 Oct 2024 08:51:35 -0700 Subject: [PATCH 1/8] ADLR/megatron-lm!2262 - ci: Allow dry-run of publish --- .gitlab-ci.yml | 17 +++- .gitlab/stages/01.test.yml | 157 ++++++++++++++++++++++++++++++++-- .gitlab/stages/03.publish.yml | 110 ++++++++++++------------ 3 files changed, 217 insertions(+), 67 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d261ed34b8..1f01679099 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,6 +19,7 @@ workflow: FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 5 @@ -27,6 +28,7 @@ workflow: FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 5 @@ -35,9 +37,11 @@ workflow: FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: FUNCTIONAL_TEST: "no" + PUBLISH: "no" - when: never auto_cancel: on_new_commit: interruptible @@ -45,19 +49,24 @@ workflow: stages: - test - functional_tests - - convergence_tests - publish default: interruptible: true variables: - UNIT_TEST_TIMEOUT: - value: "15" - description: Timeout (minutes) for Unit tests (all repeats) + UNIT_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the funtional test suite UNIT_TEST_REPEAT: value: "1" description: "Number of repetitions" + UNIT_TEST_TIMEOUT: + value: "15" + description: Timeout (minutes) for Unit tests (all repeats) FUNCTIONAL_TEST: value: "yes" options: diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml index 079f3695fb..ca55de7d84 100644 --- a/.gitlab/stages/01.test.yml +++ b/.gitlab/stages/01.test.yml @@ -1,6 +1,6 @@ .test_rules: rules: - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true when: on_success - when: on_success @@ -46,7 +46,7 @@ test:build_image: ADDITIONAL_PARAMS=() - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" ]]; then ADDITIONAL_PARAMS+=("--pull") ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") fi @@ -118,10 +118,10 @@ test:build_image: paths: - coverage rules: - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0' + - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0' allow_failure: true when: on_success - - if: $UNIT_TEST_REPEAT != '0' + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' when: on_success test:pyt(LTS)_mcore(latest): @@ -135,6 +135,8 @@ test:pyt(LTS)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_LTS_IMAGE} + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 15 test:pyt(DEV)_mcore(latest): extends: [.unit_tests] @@ -147,8 +149,10 @@ test:pyt(DEV)_mcore(0.9.0): variables: TAG: core_r0.9.0 
IMAGE: ${CI_MCORE_DEV_IMAGE} + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 15 -test:notify: +test:notify_unit_tests: extends: [.test_rules] image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} needs: @@ -229,4 +233,145 @@ test:secret_detection: echo "Atleast one vulnerability has been found" cat gl-secret-detection-report.json | jq '.' exit 1 - fi \ No newline at end of file + fi + +test:pypi_build_wheel: + extends: [.test_rules] + image: + name: quay.io/pypa/manylinux_2_28_x86_64 + entrypoint: [""] + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - echo $PUBLISH_DRYRUN + - > + if [ "$PUBLISH_DRYRUN" = "yes" ]; then + sed -i "/^PATCH/c\PATCH = $((RANDOM % 9000 + 1000))" megatron/core/package_info.py + fi + - /opt/python/cp310-cp310/bin/python -m build + - /opt/python/cp311-cp311/bin/python -m build + - auditwheel repair dist/*.whl + artifacts: + paths: + - megatron/core/package_info.py + - wheelhouse/ + +test:pypi_test_wheel: + extends: [.test_rules] + image: nvcr.io/nvidia/pytorch:24.01-py3 + needs: [test:pypi_build_wheel] + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - EXPECTED_RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - rm -rf megatron + - pip install wheelhouse/*cp310*.whl + + - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - > + echo "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER" + - test "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER" + artifacts: + paths: + - wheelhouse/ + +test:pypi_push_wheel: + extends: [.test_rules] + image: python:3.10 + tags: [mcore-docker-node-small] + needs: [test:pypi_test_wheel] + variables: + PUBLISH_DRYRUN: "yes" + script: + - > + if [ "$PUBLISH_DRYRUN" = "yes" ]; then + REPOSITORY=testpypi + export TWINE_USERNAME=$TWINE_TEST_USERNAME + export TWINE_PASSWORT=$TWINE_TEST_PASSWORD + else + REPOSITORY=pypi + export TWINE_USERNAME=$TWINE_PROD_USERNAME + export TWINE_PASSWORT=$TWINE_PROD_PASSWORD + fi + - pip install twine + - twine upload -u $TWINE_USERNAME -p $TWINE_PASSWORT --repository $REPOSITORY wheelhouse/* + +test:gh_release: + extends: [.test_rules] + tags: [mcore-docker-node-small] + image: nvcr.io/nvidia/pytorch:24.01-py3 + variables: + PUBLISH_DRYRUN: "yes" + script: + - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - NAME="NVIDIA Megatron Core $RELEASE_NUMBER" + - CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) + - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d') + - > + PAYLOAD=$(jq -nc \ + --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \ + --arg NAME "$NAME" \ + --arg BODY "$CHANGELOG" \ + '{ + "tag_name": $CI_COMMIT_BRANCH, + "target_commitish": $CI_COMMIT_BRANCH, + "name": $NAME, + "body": $BODY, + "draft": false, + "prerelease": false, + "generate_release_notes": false + }' + ) + - > + CMD=$(echo curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/Megatron-LM/releases \ + -d "$PAYLOAD" + ) + + if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then + echo "$CMD" + else + eval "$CMD" + fi + +test:notify_release: + needs: [test:pypi_push_wheel, test:gh_release] + extends: [.test_rules] + image: nvcr.io/nvidia/pytorch:24.01-py3 + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - VERSION=$(python -c "from megatron import core; print(core.__version__)") + - 
URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_r$VERSION" + - > + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "Releasebot 🤖: Megatron-Core released <'$URL'|core_r'$VERSION'> 🚀" + } + } + ] + }' + - echo "$MESSAGE" + - > + CMD=$(echo curl \ + -X POST \ + -H "Content-type: application/json" \ + --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN} + ) + + if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then + echo "$CMD" + else + eval "$CMD" + fi diff --git a/.gitlab/stages/03.publish.yml b/.gitlab/stages/03.publish.yml index e1ee94bd19..4639d7690f 100644 --- a/.gitlab/stages/03.publish.yml +++ b/.gitlab/stages/03.publish.yml @@ -1,24 +1,28 @@ .publish_common_freeze: - stage: functional_tests + stage: publish rules: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze" when: manual - when: never .publish_common_release: - stage: functional_tests + stage: publish rules: - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" when: manual + - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" + when: manual + variables: + PUBLISH_DRYRUN: "yes" - when: never -create-release-branch: +publish:release_branch: extends: [.publish_common_freeze] image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} needs: [test:build_image] tags: [mcore-docker-node-small] variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: "none" script: - git fetch origin $CI_DEFAULT_BRANCH - git config --global user.email "mcore-bot@nvidia.com" @@ -26,8 +30,8 @@ create-release-branch: - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py - VERSION=$(python -c "from megatron import core; print(core.__version__)") - - git switch --force-create core_r$VERSION origin/$CI_DEFAULT_BRANCH - - git push -u origin core_r$VERSION --force + - RELEASE_BRANCH=core_r$VERSION + - git switch --force-create $RELEASE_BRANCH origin/$CI_DEFAULT_BRANCH - | MESSAGE='{ "blocks": [ @@ -35,61 +39,53 @@ create-release-branch: "type": "section", "text": { "type": "mrkdwn", - "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `core_r$VERSION`" + "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `'"$RELEASE_BRANCH"'`" } } ] }' - + - > curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN} + - git switch --force-create bot/chore/bump-version + - git add megatron/core/package_info.py + - > + git commit -m "chore: adjust version version" + - git push -u origin bot/chore/bump-version + - > + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=bot/chore/bump-version" \ + -d "target_branch=$RELEASE_BRANCH" \ + -d "title=chore: Fix version of \`$RELEASE_BRANCH\`" \ + -d "description=[🤖]: Hi @okoenig 👋,

we've adjusted the version number of \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" -publish-wheel: - extends: [.publish_common_release] - image: quay.io/pypa/manylinux_2_28_x86_64 - tags: [mcore-docker-node-small] - script: - - export TWINE_USERNAME - - export TWINE_PASSWORT - - /opt/python/cp311-cp311/bin/pip install twine - - /opt/python/cp310-cp310/bin/python -m build - - /opt/python/cp311-cp311/bin/python -m build - - auditwheel repair dist/*.whl - - twine upload --repository pypi wheelhouse/* - -create-gh-release: - extends: [.publish_common_release] - tags: [mcore-docker-node-small] - image: - name: registry.gitlab.com/gitlab-ci-utils/curl-jq - entrypoint: [""] - script: - - | - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") - NAME="NVIDIA Megatron Core $RELEASE_NUMBER" - CHANGELOG=$(awk '/^## '$NAME'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d') - - PAYLOAD=$(jq \ - -n \ - -c \ - --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \ - --arg NAME "$NAME" \ - --arg BODY "$CHANGELOG" \ - '{ - "tag_name": $CI_COMMIT_BRANCH, - "target_commitish": $CI_COMMIT_BRANCH, - "name": $NAME, - "body": $BODY, - "draft": false, - "prerelease": false, - "generate_release_notes": false - }' - ) +publish:pypi_build_wheel: + extends: [test:pypi_build_wheel, .publish_common_release] + dependencies: [] + variables: + PUBLISH_DRYRUN: "no" + +publish:pypi_test_wheel: + extends: [test:pypi_test_wheel, .publish_common_release] + needs: [publish:pypi_build_wheel] + variables: + PUBLISH_DRYRUN: "no" + +publish:pypi_push_wheel: + extends: [test:pypi_push_wheel, .publish_common_release] + needs: [publish:pypi_test_wheel] + variables: + PUBLISH_DRYRUN: "no" - curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GH_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/NVIDIA/Megatron-LM/releases \ - -d $PAYLOAD \ No newline at end of file +publish:gh_release: + extends: [test:gh_release, .publish_common_release] + dependencies: [] + variables: + PUBLISH_DRYRUN: "no" + +publish:notify_release: + needs: [publish:pypi_push_wheel, publish:gh_release] + extends: [test:notify_release, .publish_common_release] + variables: + PUBLISH_DRYRUN: "no" \ No newline at end of file From 8bac43ac38b8e57828601fdd39e2b6bef6919108 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 25 Oct 2024 09:24:02 -0700 Subject: [PATCH 2/8] ADLR/megatron-lm!2265 - ci: Fix notifications --- tests/functional_tests/jet_recipes/gpt.yaml | 6 +++--- tests/functional_tests/jet_recipes/t5.yaml | 4 ++-- tests/functional_tests/shell_test_utils/notify.sh | 2 +- .../functional_tests/shell_test_utils/notify_unit_tests.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index 04791f0ef2..196c3372c9 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -55,7 +55,6 @@ products: - gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G - 
gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G @@ -73,10 +72,9 @@ products: - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G + # - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G @@ -139,11 +137,13 @@ products: platforms: [dgx_a100] time_limit: [1800] test_case: + - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - environment: [lts] scope: [weekly] platforms: [dgx_h100] diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 85c28c9403..6635199025 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -36,14 +36,14 @@ products: - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - environment: [lts] scope: [mr] time_limit: [1800] test_case: + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - environment: [lts] scope: [weekly] diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh index cbdc0e7030..4fa9d5deae 100644 --- a/tests/functional_tests/shell_test_utils/notify.sh +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -48,7 +48,7 @@ if [[ ${ret_code:-0} -ne 0 ]]; then fi # Fetch GitLab logs of JET downstream pipeline -DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "jet-trigger-" + $environment) | .downstream_pipeline.id' <<< "$PIPELINE_JSON") +DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "functional:run_" + $environment) | .downstream_pipeline.id' <<< "$PIPELINE_JSON") PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh index 86cb29b772..e16f8d81f9 100644 --- a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh +++ 
b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -48,7 +48,7 @@ if [[ ${ret_code:-0} -ne 0 ]]; then exit 1 fi -UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("test:unit_tests_"))]') +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("test:pyt"))]') if [[ $UNIT_TESTS_JOBS == null ]]; then FAILED_JOBS=$(curl \ From ef6cba6d0171907c637f840e0dff344fc70569b5 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 02:59:11 -0700 Subject: [PATCH 3/8] ADLR/megatron-lm!2266 - ci: Move REPEATS to launcher level --- .gitlab-ci.yml | 12 +++++ .gitlab/stages/02.functional-tests.yml | 4 ++ tests/functional_tests/jet_recipes/bert.yaml | 3 ++ .../jet_recipes/gpt-nemo.yaml | 2 + tests/functional_tests/jet_recipes/gpt.yaml | 4 ++ .../jet_recipes/multimodal-llava.yaml | 2 + tests/functional_tests/jet_recipes/t5.yaml | 4 ++ .../python_test_utils/jet/common.py | 4 ++ .../jet/generate_jet_trigger_job.py | 6 +++ .../jet/launch_jet_workload.py | 10 +++++ .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 5 +-- .../bert/bert_release/model_config.yaml | 44 ++++++++----------- .../common/ckpt_converter/model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../gpt/gpt3_15b_8t_release/model_config.yaml | 15 +------ .../gpt3_15b_8t_release_sm/model_config.yaml | 15 +------ .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 2 +- .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - 
.../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 14 ------ .../model_config.yaml | 16 +------ .../model_config.yaml | 16 +------ .../model_config.yaml | 14 ------ .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../t5/t5_release/model_config.yaml | 33 ++++++-------- .../test_flattened_resharding.py | 1 + 156 files changed, 130 insertions(+), 303 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f01679099..06334601b4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,6 +17,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_TIME_LIMIT: 1800, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -26,6 +28,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_TIME_LIMIT: 1800, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -35,6 +39,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_REPEAT: 1, + FUNCTIONAL_TEST_TIME_LIMIT: 9000, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -82,6 +88,12 @@ variables: - "pre-release" - "release" description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" + FUNCTIONAL_TEST_REPEAT: + value: "5" + description: "Number of repetitions per test" + FUNCTIONAL_TEST_TIME_LIMIT: + value: "1800" + description: "Timeout in seconds per test" FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 07f4966734..db49c99c60 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -57,6 +57,8 @@ functional:configure: python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment dev \ + --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ + --time-limit 
"$FUNCTIONAL_TEST_TIME_LIMIT" \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ @@ -68,6 +70,8 @@ functional:configure: python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment lts \ + --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ + --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 30349d708d..89a097641e 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_bert.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: - environment: [lts, dev] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G @@ -43,6 +45,7 @@ products: - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G - environment: [lts] scope: [nightly] + n_repeat: [5] time_limit: [3600] test_case: - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 366cae1f21..01e79b4793 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -31,6 +32,7 @@ spec: products: - environment: [dev] scope: [mr] + n_repeat: [5] test_case: - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index 196c3372c9..32ee90109b 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -23,6 +23,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: scope: [mr] platforms: [dgx_a100] time_limit: [1800] + n_repeat: [5] test_case: - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G @@ -103,6 +105,7 @@ products: scope: [nightly] platforms: [dgx_a100] time_limit: [3600] + n_repeat: [5] test_case: - 
gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather @@ -136,6 +139,7 @@ products: scope: [mr] platforms: [dgx_a100] time_limit: [1800] + n_repeat: [5] test_case: - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 981404db64..a6202e4910 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_vlm.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -31,6 +32,7 @@ spec: products: - environment: [lts, dev] scope: [mr] + n_repeat: [5] test_case: - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 6635199025..eb76892661 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_t5.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: - environment: [lts, dev] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G @@ -41,6 +43,7 @@ products: - environment: [lts] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G @@ -48,6 +51,7 @@ products: - environment: [lts] scope: [weekly] time_limit: [9000] + n_repeat: [1] test_case: - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py index eed22752c6..9313e0a59c 100644 --- a/tests/functional_tests/python_test_utils/jet/common.py +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -134,6 +134,8 @@ def filter_by_model( def load_workloads( container_tag: str, + n_repeat: int = 1, + time_limit: int = 1800, environment: Optional[str] = None, scope: Optional[str] = None, model: Optional[str] = None, @@ -171,4 +173,6 @@ def load_workloads( container_image = container_image or build_workload.spec.source.image build_workload.spec.source.image = f"{container_image}:{container_tag}" workloads.append(build_workload) + workload.spec.n_repeat = n_repeat + workload.spec.time_limit = time_limit return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py 
b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 3922de3f86..670072fc86 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -12,6 +12,8 @@ @click.command() @click.option("--scope", required=True, type=str, help="Test scope") @click.option("--environment", required=True, type=str, help="LTS or dev features") +@click.option("--n-repeat", required=False, default=1, type=int) +@click.option("--time-limit", required=False, default=1, type=int) @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") @@ -29,6 +31,8 @@ def main( scope: str, environment: str, + n_repeat: int, + time_limit: int, a100_cluster: str, h100_cluster: str, output_path: str, @@ -63,6 +67,8 @@ def main( "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", f"--model {test_case.spec.model}", f"--environment {test_case.spec.environment}", + f"--n-repeat {n_repeat}", + f"--time-limit {time_limit}", f"--test-case {test_case.spec.test_case}", f"--container-tag {container_tag}", f"--cluster {cluster}", diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py index 5ec4e84ae1..0418dd3937 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -42,6 +42,8 @@ def sigterm_handler(_signo, _stack_frame): def launch_and_wait_for_completion( test_case: str, environment: str, + n_repeat: int, + time_limit: int, container_image: str, container_tag: str, cluster: str, @@ -54,6 +56,8 @@ def launch_and_wait_for_completion( ).workloads.submit( workloads=common.load_workloads( test_case=test_case, + n_repeat=n_repeat, + time_limit=time_limit, container_image=container_image, container_tag=container_tag, environment=environment, @@ -142,6 +146,8 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]: @click.option( "--environment", required=True, type=click.Choice(['dev', 'lts']), help="Pytorch LTS or DEV" ) +@click.option("--n-repeat", required=False, default=1, type=int) +@click.option("--time-limit", required=False, default=1800, type=int) @click.option( "--account", required=False, @@ -165,6 +171,8 @@ def main( model: str, test_case: str, environment: str, + n_repeat: int, + time_limit: int, account: str, cluster: str, container_tag: str, @@ -195,6 +203,8 @@ def main( pipeline = launch_and_wait_for_completion( test_case=test_case, environment=environment, + n_repeat=n_repeat, + time_limit=time_limit, container_image=container_image, container_tag=container_tag, cluster=cluster, diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 704fd1ce5a..d9268d02ec 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 
--hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml index eaf288d30d..207acb5aa4 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 7072374fab..a8fb420757 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml index f3afb10fd5..10fbeb700e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml index 1e8f604797..991dfae683 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 66ab6cabfd..cfc4827a2e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 
94d2f2feca..c3c70f8b0e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 2f6d24e945..9ffa49327d 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index cb94c9c91b..73ad47092d 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,4 +42,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml index 3dd071d3de..29fa50cab2 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -44,4 +43,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 6d39266da3..d8fb0dc61f 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,4 +42,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml 
b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 989988f7cd..2d35954bf4 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -6,7 +6,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -46,4 +45,4 @@ MODEL_ARGS: --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index edcf75a772..abc650a5e2 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -6,7 +6,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,7 +42,7 @@ MODEL_ARGS: --deterministic-mode: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} - --fp16: true + --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml index 5c92fbf7da..b9de9dc01f 100644 --- a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml @@ -3,52 +3,46 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' NVTE_FLASH_ATTN: '0' NVTE_FUSED_ATTN: '0' - TEST_TYPE: 'release' - MODEL_ARGS: # Bert model args - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --seq-length: 512 - --max-position-embeddings: 512 - + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --seq-length: 512 + --max-position-embeddings: 512 # Training args - --micro-batch-size: 4 - --global-batch-size: 32 - --train-iters: 20000 - --weight-decay: 1e-2 - --clip-grad: 1.0 + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 20000 + --weight-decay: 1e-2 + --clip-grad: 1.0 --fp16: true --lr: 0.0001 - --lr-decay-style: linear - --min-lr: 1.0e-5 - --lr-warmup-fraction: .01 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: .01 --bert-no-binary-head: true - # Model parallel - --tensor-model-parallel-size: 8 - --pipeline-model-parallel-size: 8 - + --tensor-model-parallel-size: 8 + --pipeline-model-parallel-size: 8 # Data args --data-path: ${DATA_BLEND} - --vocab-file: ${DATA_PATH}/vocab.txt + --vocab-file: ${DATA_PATH}/vocab.txt --split: 949,50,1 --data-cache-path: ${DATA_CACHE_PATH} - # EVAL_AND_LOGGING_ARGS --log-interval: 100 --save-interval: 2000 - --eval-interval: 1000 + --eval-interval: 1000 --save: ${CHECKPOINT_PATH} --load: ${CHECKPOINT_PATH} --eval-iters: 10 - --tensorboard-dir: ${TENSORBOARD_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --log-num-zeros-in-grad: true --log-params-norm: true --log-validation-ppl-to-tensorboard: true --wandb-project: 
megatron-core-release-runs - --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file + --wandb-exp-name: ${WANDB_EXPERIMENT} diff --git a/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml b/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml index bffa64bc52..2ac5db1147 100644 --- a/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml +++ b/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml @@ -3,6 +3,5 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml index 89c71f6291..51dbdfd67b 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 - N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index d7e926e96e..a48bfeae7f 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 - N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index bf88792152..89bc2ae8b6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -17,7 +15,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -27,10 +24,8 @@ MODEL_ARGS: --global-batch-size: 1152 --train-samples: 19531250 --manual-gc: true - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer @@ -39,7 +34,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --apply-layernorm-1p: true --untie-embeddings-and-output-weights: true @@ -54,13 +48,11 @@ MODEL_ARGS: --num-query-groups: 8 --seq-length: 4096 --max-position-embeddings: 4096 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 
0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -71,19 +63,15 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add validation args --eval-iters: 32 --eval-interval: 2000 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.0134 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -95,6 +83,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args - --bf16: true \ No newline at end of file + --bf16: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 9453db100c..b279c96f05 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -17,7 +15,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -27,10 +24,8 @@ MODEL_ARGS: --global-batch-size: 1152 --train-samples: 4882812 --manual-gc: true - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer @@ -39,7 +34,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --apply-layernorm-1p: true --untie-embeddings-and-output-weights: true @@ -54,13 +48,11 @@ MODEL_ARGS: --num-query-groups: 8 --seq-length: 4096 --max-position-embeddings: 4096 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -71,19 +63,15 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add validation args --eval-iters: 32 --eval-interval: 2000 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.0134 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -95,6 +83,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args - --bf16: true \ No newline at end of file + --bf16: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 459270a1b2..69ad59f080 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +49,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index dcb80dc007..fd1e7253c9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index d94f5277d4..2b94108731 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml index 9f210d838f..d9ed9c7602 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml index b943bfec0f..abb85baa55 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml index 108cb6b1a4..e40b6f61ee 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml index 1c2a42eaaa..a2960f3a37 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml index cb0214f264..6beae45b8a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 97d3d8c5f0..d50c59d5f6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml index 1a15825731..2b01cfa62f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml index c6728722e2..a74327d67f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 37cc4615a5..267a290a59 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end 
of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml index 528b691a28..77c55fac92 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 4f5e8d93b7..d5d4413669 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml index 64d504bf29..7fac1317c4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml index 190e5777f2..2c05343a10 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} 
--fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 99d0ac8f6b..2d4f4d2a15 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml index 6242b2ebbc..05eb509e6b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml index 81727e052d..4b1288dbe2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml index 525d0f2c90..d55fb7510c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml index 516e1dd517..c0aceac272 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml index 10fc8c2f23..c2439f9f36 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml index ba219d4445..4c3a4fb095 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml index c547f47970..69dc9edf52 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml index 72c98e80be..bd324b8ba1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml index 03ddd8a7ca..e8723049fb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index 84128fa780..226809ade0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml index b664115f27..8746c03a36 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml index 0ec5d88ad9..7d0be91444 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index ee84d93de2..c9de15222e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml index ffdaec80ad..90c257012f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 9dd9e9ecd0..fcaad99320 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index 470ba6f926..1741647355 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index fb07f9d30c..b51ada7c08 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 7cdb56dd00..2d2c1ce9a0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index 7bdd0c46e2..7689c48dcc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index b014fdabc0..40f43682b7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree 
CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index b2a1643ec8..ecc4c7fa76 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index 6c2c9e51ab..65a87d67a1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 2e0188551a..f3e4ce8a6f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 8fa10f4b9d..440638b53d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml index c64a4ef5e7..059716a6a3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml index dda1876e1a..f82a51e4f3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index df7ba9fb3b..3d4dc222a4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml index 479916c654..3e5acc65a0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml index 20c57f0c95..9ae648b7bf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index f7c52c997f..85e8e81ff3 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 210febf448..fea891cd94 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml index fd67df60ca..b096c06b6c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index 0c0bc85f61..a2c641b31d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 7a92bfd8cd..2b9346ee7e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index ef5b64d284..61adccbb97 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml index ca1de0ad37..023747a480 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 30137a040d..e573b90971 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml index 1513a18192..c31e5b66b3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 077c9a36e8..9b02b473bd 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 1ccbe1ae31..d98716ac4d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index b9ca819495..92b2e3528a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 25ea6c933b..1f2fa9e2dc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 7b7bc27f4b..49865dde85 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml index 059265a079..bdb6ab3081 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml @@ -49,4 +49,4 @@ MODEL_ARGS: --bf16: true --decoder-first-pipeline-num-layers: 2 --decoder-last-pipeline-num-layers: 2 -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 7da0cc5ddd..01c7ffc2f1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 476a1b6b93..2cc6bd5c6f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index 613559a96e..95f6e35591 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index a1f86a64c7..edc9eed73d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 6c454ecca7..b12ef70b9e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index cf4a90e410..5246a6ecf1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 793bfb21d4..46a56c1090 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index 29b87e9073..3d4d717349 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index c4b791a9d4..be3e678db6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index c2631e84e0..a2fb0f51af 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index bc5da0c312..f3da93728f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 7c437e0b10..91e9e836c0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff 
--git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index dde8a620d3..5630ddd719 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 303182bcaf..8f0bf337b9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index c08ce2e01c..31544968ff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index 959c286a50..75a485403a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index c9938b5ee1..9b5deed4cb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index 23060e55e4..693a2d39f9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 32bd642deb..3aa23b39a4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 7d64cf477f..4a8a6abdd0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 6014052dd6..95f706d04a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 6d8a590974..e74a0cc992 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index c304692d62..f041fd4ac7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index d8f1585ae2..e683475ffd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index c02d1fdc67..1b416d029a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 7d5b13b753..4f922838b3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml index cff824669b..bdb039ffda 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 8846dacb40..b56afa8e52 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 9295cdc580..f482eda5e6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index b8f1667cdb..43224c5849 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index d2888f767c..dda321f572 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 27acfbee86..93e1ce6463 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index 1ea30bae73..6418b0c5d2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml index f3348d608d..a5de201786 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml index fbb767cb14..226dfbc6b6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml index cf65df920f..168da23f9b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 SKIP_PYTEST: 1 - N_REPEATS: 1 BEFORE_SCRIPT: pip uninstall -y transformer_engine pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely MODEL_ARGS: --num-layers: 12 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index af105662a9..56d76fa39e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 3d27f95aa6..52b0887e00 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index 1e6b07a429..0923fd41f1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml index 2ff5fc2224..9ea57cb3ac 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 4e4a963417..ea96682fe4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 8d11e207e7..beaaa986ab 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index 9516076dc6..9f913d089f 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -4,9 +4,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True NCCL_NVLS_ENABLE: 0 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -15,7 +13,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -25,10 +22,8 @@ MODEL_ARGS: --global-batch-size: 256 --train-samples: 38400 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer @@ -37,7 +32,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -54,13 +48,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 255126953 --lr-warmup-samples: 162761 @@ -69,7 +61,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 8 --num-experts: 8 @@ -78,11 +69,9 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 500 - # Add checkpointing args --finetune: true --auto-detect-ckpt-format: true @@ -90,10 +79,8 @@ MODEL_ARGS: --save: ${OUTPUT_PATH}/checkpoints --no-ckpt-fully-parallel-save: true --save-interval: 500 - # Add initialization args --init-method-std: 0.008 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml 
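The hunks above drop the per-test N_REPEATS override from the functional-test model_config.yaml files; the repeat count is now supplied at the pipeline level (FUNCTIONAL_TEST_REPEAT / --n-repeat, see the CI changes later in this series). A rough sketch of how such a config might be consumed is below; the loader function and launch flow are illustrative assumptions, not part of this patch.

    # Illustrative only: a minimal loader for one of these model_config.yaml files.
    import os
    import yaml

    def load_model_config(path: str) -> dict:
        with open(path) as f:
            cfg = yaml.safe_load(f)
        # ENV_VARS entries (e.g. NVTE_ALLOW_NONDETERMINISTIC_ALGO) become process env;
        # the repeat count is no longer read from here now that N_REPEATS is gone.
        for key, value in cfg.get("ENV_VARS", {}).items():
            os.environ[key] = str(value)
        # MODEL_ARGS maps flag names to values, e.g. {"--num-layers": 12, ...}
        return cfg.get("MODEL_ARGS", {})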
b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index 585d9bb2c7..fa483b8770 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -18,7 +16,6 @@ MODEL_ARGS: --overlap-grad-reduce: true --overlap-param-gather: true --no-ckpt-fully-parallel-save: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -28,19 +25,16 @@ MODEL_ARGS: --global-batch-size: 1024 --train-samples: 24414063 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer - --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -57,13 +51,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -72,7 +64,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 4 --num-experts: 8 @@ -81,19 +72,15 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.010 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index 22607416a3..969e9f17e6 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -18,7 +16,6 @@ MODEL_ARGS: --overlap-grad-reduce: true --overlap-param-gather: true --no-ckpt-fully-parallel-save: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -28,19 +25,16 @@ MODEL_ARGS: --global-batch-size: 1024 --train-samples: 6103515 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args 
--data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer - --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -57,13 +51,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -72,7 +64,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 4 --num-experts: 8 @@ -81,19 +72,15 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.010 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index 39421a887e..33593ffca7 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -4,9 +4,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True NCCL_NVLS_ENABLE: 0 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -16,7 +14,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -26,10 +23,8 @@ MODEL_ARGS: --global-batch-size: 256 --train-samples: 51200 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer @@ -38,7 +33,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -55,13 +49,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 255126953 --lr-warmup-samples: 162761 @@ -70,7 +62,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 8 --num-experts: 8 @@ -79,11 +70,9 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --finetune: true --auto-detect-ckpt-format: true @@ 
-91,10 +80,8 @@ MODEL_ARGS: --save: ${OUTPUT_PATH}/checkpoints --no-ckpt-fully-parallel-save: true --save-interval: 500 - # Add initialization args --init-method-std: 0.008 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -106,6 +93,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index 6da0c3a85a..b3b81d5033 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml index 816aa8bf1f..cdfdac5ffe 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml index 180e6beedd..22f816cd89 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml index 1fade8fd4e..4a829aca1d 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 076389c3d6..e781e0980b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index b0d00b8f83..33daffa1e1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml index d1b9e8429e..ac40afa88a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 540d4c1b73..7a1690768a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 8abace27d3..2df13fd07b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index c1a6d51bf1..23f9be2841 100644 --- 
a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml index 6aae44ca71..3f19d3a3f1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 6e9731d4ce..243e1fc052 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml index 6556baeb59..798f00c902 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -51,4 +51,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml index 70077b84a9..df56656bd6 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml @@ -51,4 +51,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml index 3a1793957b..940b85cfab 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml +++ 
b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml @@ -52,4 +52,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml index 233023af31..a05129f539 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml index 43afd73364..91c6e2e220 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml index 47ff5b038b..cf95759fc5 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index 64784c36a6..5cc9a2e0d6 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -3,44 +3,38 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' NVTE_FLASH_ATTN: '0' NVTE_FUSED_ATTN: '0' - TEST_TYPE: 'release' - MODEL_ARGS: # T5 model args --encoder-num-layers: 12 --decoder-num-layers: 12 --hidden-size: 768 - --num-attention-heads: 12 + --num-attention-heads: 12 --kv-channels: 64 --ffn-hidden-size: 3072 --encoder-seq-length: 512 --decoder-seq-length: 128 - --max-position-embeddings: 512 + --max-position-embeddings: 512 --init-method-std: 0.015 - # Training args - --micro-batch-size: 32 - --global-batch-size: 512 - --train-iters: 100000 - --weight-decay: 1e-2 - --clip-grad: 1.0 + --micro-batch-size: 32 + --global-batch-size: 512 + --train-iters: 100000 + --weight-decay: 1e-2 + --clip-grad: 1.0 --bf16: true --lr: 0.0001 - --lr-decay-style: linear - --min-lr: 1.0e-5 - --lr-warmup-fraction: .01 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: 
.01 --distributed-backend: nccl - # Transformer Engine args --use-mcore-models: true --transformer-impl: transformer_engine - # Model parallel --tensor-model-parallel-size: 4 - --pipeline-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 --encoder-pipeline-model-parallel-size: 0 - # Data args --data-path: ${DATA_BLEND} --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt @@ -48,11 +42,10 @@ MODEL_ARGS: --split: 99982,9,9 --data-cache-path: ${DATA_CACHE_PATH} --vocab-extra-ids: 100 - # EVAL_AND_LOGGING_ARGS --log-interval: 100 --save-interval: 2000 - --eval-interval: 1000 + --eval-interval: 1000 --save: ${CHECKPOINT_PATH} --load: ${CHECKPOINT_PATH} --eval-iters: 10 @@ -64,4 +57,4 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --timing-log-level: 2 --wandb-project: megatron-core-release-runs - --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file + --wandb-exp-name: ${WANDB_EXPERIMENT} diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py index fa00a20cad..be7621d7f4 100644 --- a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py +++ b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py @@ -33,6 +33,7 @@ def teardown_method(self, method): ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))], ) + @pytest.mark.flaky def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): Utils.initialize_model_parallel(*src_tp_pp) with TempNamedDir( From 6e05f339f2ebbcd7369f62dd963809ec880b5420 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 09:15:53 -0700 Subject: [PATCH 4/8] ADLR/megatron-lm!2268 - ci: Fix defaults --- .gitlab-ci.yml | 8 ++++---- tests/unit_tests/dist_checkpointing/test_async_save.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 06334601b4..a93e1cb615 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,7 +18,7 @@ workflow: FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_TIME_LIMIT: 1800, + FUNCTIONAL_TEST_TIME_LIMIT: 1800 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -29,7 +29,7 @@ workflow: FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_TIME_LIMIT: 1800, + FUNCTIONAL_TEST_TIME_LIMIT: 1800 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -39,8 +39,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly - FUNCTIONAL_TEST_REPEAT: 1, - FUNCTIONAL_TEST_TIME_LIMIT: 9000, + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_TIME_LIMIT: 9000 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py index d6aa879982..d50aea30e2 100644 --- a/tests/unit_tests/dist_checkpointing/test_async_save.py +++ b/tests/unit_tests/dist_checkpointing/test_async_save.py @@ -71,6 +71,7 @@ def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt): @pytest.mark.parametrize('async_save', [False, True]) @pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn]) + @pytest.mark.flaky def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn): Utils.initialize_model_parallel(2, 4) sharded_state_dict = { From d00cc116f53ded94c13485e2cd939a4105f28716 
Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sat, 26 Oct 2024 19:29:18 -0700 Subject: [PATCH 5/8] ADLR/megatron-lm!2195 - Remove guard blocking distributed optimizer when TE/Apex are not installed --- megatron/core/optimizer/distrib_optimizer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index dfa8d51979..e814794f0b 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -18,7 +18,7 @@ try: from apex.optimizers import FusedAdam as Adam except ImportError: - from torch.optim import Adam + from torch.optim import AdamW as Adam HAVE_APEX_OR_TE = False @@ -462,10 +462,6 @@ def __init__( if has_config_logger_enabled(config): log_config_to_disk(config, locals(), prefix=type(self).__name__) - assert ( - HAVE_APEX_OR_TE - ), f'Please install Apex or Transformer Engine to use DistributedOptimizer.' - super().__init__(optimizer, config, grad_scaler, init_state_fn) self.model_chunks = model_chunks self.ddp_config = self.model_chunks[0].ddp_config From 5b2f5b08e917ef9741d12cb46fff7de46095c4bf Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 19:29:20 -0700 Subject: [PATCH 6/8] ADLR/megatron-lm!2255 - ci: Improvements around functional triggering --- .gitlab-ci.yml | 3 + .gitlab/stages/02.functional-tests.yml | 2 + .../python_test_utils/jet/common.py | 48 ++++++-- .../jet/generate_jet_trigger_job.py | 108 +++++++++++------- 4 files changed, 110 insertions(+), 51 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a93e1cb615..83d432ea71 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,6 +94,9 @@ variables: FUNCTIONAL_TEST_TIME_LIMIT: value: "1800" description: "Timeout in seconds per test" + FUNCTIONAL_TEST_CASES: + value: "all" + description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index db49c99c60..99d6b4888a 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -59,6 +59,7 @@ functional:configure: --environment dev \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ + --test-cases $FUNCTIONAL_TEST_CASES \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ @@ -72,6 +73,7 @@ functional:configure: --environment lts \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ + --test-cases $FUNCTIONAL_TEST_CASES \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py index 9313e0a59c..301189e8e2 100644 --- a/tests/functional_tests/python_test_utils/jet/common.py +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -65,7 +65,7 @@ def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: def filter_by_test_case( workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str -) -> jetclient.JETWorkloadManifest: +) -> Optional[jetclient.JETWorkloadManifest]: """Returns a workload with matching name. 
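The distrib_optimizer.py hunk above removes the hard Apex/TE requirement and switches the pure-PyTorch fallback from Adam to AdamW, presumably because AdamW's decoupled weight decay is the closer match to Apex FusedAdam's default behaviour. A minimal sketch of the resulting import pattern; the two usage lines at the end are illustrative only.

    import torch

    try:
        from apex.optimizers import FusedAdam as Adam    # fused CUDA kernels when Apex is present
    except ImportError:
        from torch.optim import AdamW as Adam            # decoupled weight decay, closest match

    # Illustrative usage:
    params = [torch.nn.Parameter(torch.zeros(2, 2))]
    optimizer = Adam(params, lr=1e-4, weight_decay=0.1)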
Raises an error if there no or more than a single workload.""" workload_manifests = list( workload_manifest @@ -74,10 +74,12 @@ def filter_by_test_case( ) if len(workload_manifests) > 1: - raise ValueError("Duplicate test_case found!") + print("Duplicate test_case found!") + return if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return return workload_manifests[0] @@ -93,7 +95,8 @@ def filter_by_scope( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] return workload_manifests @@ -111,7 +114,8 @@ def filter_by_environment( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] return workload_manifests @@ -127,7 +131,26 @@ def filter_by_model( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] + + return workload_manifests + + +def filter_by_test_cases( + workload_manifests: List[jetclient.JETWorkloadManifest], test_cases: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns a workload with matching name. Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + for test_case in test_cases.split(",") + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) == 0: + print("No test_case found!") + return [] return workload_manifests @@ -137,6 +160,7 @@ def load_workloads( n_repeat: int = 1, time_limit: int = 1800, environment: Optional[str] = None, + test_cases: str = "all", scope: Optional[str] = None, model: Optional[str] = None, test_case: Optional[str] = None, @@ -156,15 +180,21 @@ def load_workloads( if scope: workloads = filter_by_scope(workload_manifests=workloads, scope=scope) - if environment: + if workloads and environment: workloads = filter_by_environment(workload_manifests=workloads, environment=environment) - if model: + if workloads and model: workloads = filter_by_model(workload_manifests=workloads, model=model) - if test_case: + if workloads and test_cases != "all": + workloads = filter_by_test_cases(workload_manifests=workloads, test_cases=test_cases) + + if workloads and test_case: workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + if not workloads: + return [] + for workload in list(workloads): for build_workload in build_workloads: if ( diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 670072fc86..b21de4a22f 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -14,6 +14,9 @@ @click.option("--environment", required=True, type=str, help="LTS or dev features") @click.option("--n-repeat", required=False, default=1, type=int) @click.option("--time-limit", required=False, default=1, type=int) +@click.option( + "--test-cases", required=True, type=str, help="Comma-separated list of test_cases, or 'all'" +) @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") @@ -33,6 +36,7 @@ def main( environment: str, 
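A self-contained sketch of the new comma-separated test-case filter added to common.py; Workload and Spec below are stand-ins for jetclient.JETWorkloadManifest so the behaviour can be run in isolation.

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Spec:
        test_case: str

    @dataclass
    class Workload:          # stand-in for jetclient.JETWorkloadManifest
        spec: Spec

    def filter_by_test_cases(workloads: List[Workload], test_cases: str) -> List[Workload]:
        wanted = test_cases.split(",")
        selected = [w for w in workloads if w.spec.test_case in wanted]
        if not selected:
            print("No test_case found!")   # same soft failure as the patched helpers
        return selected

    # "all" is handled one level up: load_workloads only applies this filter
    # when FUNCTIONAL_TEST_CASES != "all".
    pool = [Workload(Spec("gpt3_mr_tp2_pp2")), Workload(Spec("t5_220m_mr_tp4_pp1"))]
    print([w.spec.test_case for w in filter_by_test_cases(pool, "gpt3_mr_tp2_pp2,unknown")])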
n_repeat: int, time_limit: int, + test_cases: str, a100_cluster: str, h100_cluster: str, output_path: str, @@ -44,56 +48,76 @@ def main( test_cases = [ test_case for test_case in common.load_workloads( - scope=scope, container_tag=container_tag, environment=environment + scope=scope, container_tag=container_tag, environment=environment, test_cases=test_cases ) if test_case.type != "build" ] - gitlab_pipeline = { - "stages": list(set([test_case.spec.model for test_case in test_cases])), - "default": {"interruptible": True}, - } + if not test_cases: + gitlab_pipeline = { + "stages": ["empty-pipeline-placeholder"], + "default": {"interruptible": True}, + "empty-pipeline-placeholder-job": { + "stage": "empty-pipeline-placeholder", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "script": ["sleep 1"], + "artifacts": {"paths": ["results/"], "when": "always"}, + }, + } + + else: + gitlab_pipeline = { + "stages": list(set([test_case.spec.model for test_case in test_cases])), + "default": {"interruptible": True}, + } - for test_case in test_cases: - if test_case.spec.platforms == "dgx_a100": - cluster = a100_cluster - elif test_case.spec.platforms == "dgx_h100": - cluster = h100_cluster - else: - raise ValueError(f"Platform {test_case.spec.platforms} unknown") + for test_case in test_cases: + if test_case.spec.platforms == "dgx_a100": + cluster = a100_cluster + elif test_case.spec.platforms == "dgx_h100": + cluster = h100_cluster + else: + raise ValueError(f"Platform {test_case.spec.platforms} unknown") - script = [ - "export PYTHONPATH=$(pwd); " - "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", - f"--model {test_case.spec.model}", - f"--environment {test_case.spec.environment}", - f"--n-repeat {n_repeat}", - f"--time-limit {time_limit}", - f"--test-case {test_case.spec.test_case}", - f"--container-tag {container_tag}", - f"--cluster {cluster}", - ] + script = [ + "export PYTHONPATH=$(pwd); " + "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + f"--model {test_case.spec.model}", + f"--environment {test_case.spec.environment}", + f"--n-repeat {n_repeat}", + f"--time-limit {time_limit}", + f"--test-case {test_case.spec.test_case}", + f"--container-tag {container_tag}", + f"--cluster {cluster}", + ] - if run_name is not None and wandb_experiment is not None: - script.append(f"--run-name {run_name}") - test_case.spec.model - script.append( - f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" - ) + if run_name is not None and wandb_experiment is not None: + script.append(f"--run-name {run_name}") + test_case.spec.model + script.append( + f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" + ) - gitlab_pipeline[test_case.spec.test_case] = { - "stage": f"{test_case.spec.model}", - "image": f"{container_image}:{container_tag}", - "tags": ["mcore-docker-node-jet"], - "rules": [ - {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, - {"if": '$CI_MERGE_REQUEST_ID'}, - ], - "timeout": "7 days", - "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], - "script": [" ".join(script)], - "artifacts": {"paths": ["results/"], "when": "always"}, - } + gitlab_pipeline[test_case.spec.test_case] = { + "stage": 
f"{test_case.spec.model}", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "script": [" ".join(script)], + "artifacts": {"paths": ["results/"], "when": "always"}, + } with open(output_path, 'w') as outfile: yaml.dump(gitlab_pipeline, outfile, default_flow_style=False) From 210162aebcfc68d72f39049d5cf84a83d3b11dea Mon Sep 17 00:00:00 2001 From: Xin Yao Date: Mon, 28 Oct 2024 03:56:56 -0700 Subject: [PATCH 7/8] ADLR/megatron-lm!2201 - Make RoPE work with packed sequence and CP and Miscellaneous fixes --- .../core/extensions/transformer_engine.py | 49 ++++++++------ .../models/common/embeddings/rope_utils.py | 65 +++++++++++++------ .../common/embeddings/rotary_pos_embedding.py | 21 ++++-- .../embeddings/yarn_rotary_pos_embedding.py | 10 +++ megatron/core/models/gpt/gpt_model.py | 7 +- megatron/core/transformer/attention.py | 10 ++- .../core/transformer/transformer_config.py | 17 ++++- megatron/training/arguments.py | 2 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + 10 files changed, 131 insertions(+), 52 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 47606af27d..a33082d6f0 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -655,11 +655,6 @@ def forward( packed_seq_kwargs = ( dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} ) - # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set - # after init - if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): - self.qkv_format = 'bshd' - qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) if get_te_version() < PkgVersion("1.3.0"): @@ -676,17 +671,6 @@ def forward( packed_seq_kwargs.pop("cu_seqlens_q_padded", None) packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) - if self.config.apply_rope_fusion and qkv_format == 'bshd': - query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] - # In PyTorch, the following two tensors are in fact the same: - # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) - # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) - # Stride for a dimension that is 1 has no meaning, so tensors created two different ways - # can have same shape but different strides. 
- # We unify them to the first one to pass the stride check in TE - if value.shape == key.shape and value.shape[0] == 1 and value.stride() != key.stride(): - value = value.as_strided(value.shape, key.stride()) - if self.te_forward_mask_type: if qkv_format == 'thd' and is_te_min_version("1.7.0"): # thd format uses flash attention with cuDNN kernel which requires is_padding=True, @@ -707,10 +691,7 @@ def forward( else: core_attn_out = super().forward(query, key, value, attention_mask, **packed_seq_kwargs) - if self.config.apply_rope_fusion and qkv_format == 'bshd': - return core_attn_out.transpose(0, 1) - else: - return core_attn_out + return core_attn_out if is_te_min_version("1.9.0.dev0"): @@ -1091,3 +1072,31 @@ def get_cpu_offload_context( except ImportError: get_cpu_offload_context = None + + +try: + + from transformer_engine.pytorch.attention import FusedRoPEFunc + + def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + """Apply rotary positional embedding to input tensor T in `sbhd` format.""" + return FusedRoPEFunc.apply(t, freqs, "sbhd") + + def fused_apply_rotary_pos_emb_thd( + t: torch.Tensor, + cu_seqlens: torch.Tensor, + freqs: torch.Tensor, + cp_size: int = 1, + cp_rank: int = 0, + ) -> torch.Tensor: + """ + Apply rotary positional embedding to input tensor T in `thd` format with CP support. + """ + if is_te_min_version("1.11.0", check_equality=False): + return FusedRoPEFunc.apply(t, freqs, "thd", cu_seqlens, cp_size, cp_rank) + else: + return FusedRoPEFunc.apply(t, freqs, "thd", cu_seqlens) + +except ImportError: + + pass diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py index accb251961..fc7d355827 100644 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -13,18 +13,27 @@ from torch import Tensor from megatron.core import parallel_state +from megatron.core.utils import is_te_min_version logger = logging.getLogger(__name__) try: - from apex.transformer.functional import ( + from megatron.core.extensions.transformer_engine import ( fused_apply_rotary_pos_emb, fused_apply_rotary_pos_emb_thd, ) HAVE_APPLY_ROPE_FUSION = True except ImportError: - HAVE_APPLY_ROPE_FUSION = False + try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True + except ImportError: + HAVE_APPLY_ROPE_FUSION = False def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: @@ -103,6 +112,20 @@ def _apply_rotary_pos_emb_bshd( return torch.cat((t, t_pass), dim=-1) +def _get_thd_freqs_on_this_cp_rank(cp_rank: int, cp_size: int, x: Tensor, freqs: Tensor) -> Tensor: + if cp_size > 1: + cp_seg = x.size(0) // 2 + full_seqlen = cp_size * x.size(0) + return torch.cat( + [ + freqs[cp_rank * cp_seg : (cp_rank + 1) * cp_seg], + freqs[full_seqlen - (cp_rank + 1) * cp_seg : full_seqlen - cp_rank * cp_seg], + ] + ) + else: + return freqs[: x.size(0)] + + def _apply_rotary_pos_emb_thd( t: Tensor, cu_seqlens: Tensor, @@ -123,12 +146,16 @@ def _apply_rotary_pos_emb_thd( Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
""" + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cu_seqlens = cu_seqlens // cp_size seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( [ _apply_rotary_pos_emb_bshd( x.unsqueeze(1), - freqs[: x.size(0)], + _get_thd_freqs_on_this_cp_rank(cp_rank, cp_size, x, freqs), rotary_interleaved=rotary_interleaved, multi_latent_attention=multi_latent_attention, mscale=mscale, @@ -149,28 +176,24 @@ def apply_rotary_pos_emb( Reroute to the appropriate apply_rotary_pos_emb function depending on fused/unfused kernels, or bshd (conventional) / thd (packed seq) format """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False - # so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): - logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" - ) - apply_rotary_pos_emb.printed_fused_warning = True - - if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: - logger.warning( - "rotary_interleaved is not supported with multi_latent_attention, setting it to False" - ) - config.rotary_interleaved = False if config.apply_rope_fusion: if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + return fused_apply_rotary_pos_emb(t, freqs) else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + cp_size = parallel_state.get_context_parallel_world_size() + if cp_size > 1: + if not is_te_min_version("1.11.0", check_equality=False): + raise ValueError("Only TE >= 1.12 supports RoPE fusion for THD format with CP.") + return fused_apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + cp_size=cp_size, + cp_rank=parallel_state.get_context_parallel_rank(), + ) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) else: if cu_seqlens is None: return _apply_rotary_pos_emb_bshd( diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 5232faec60..92c3efb379 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -7,9 +7,12 @@ if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_block import TransformerBlock + from megatron.core.inference_params import InferenceParams + from megatron.core.packed_seq_params import PackedSeqParams import logging import math +from functools import lru_cache import torch from torch import Tensor, nn @@ -109,12 +112,14 @@ def _apply_scaling( return inv_freq_llama - def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + @lru_cache(maxsize=32) + def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor: """Forward pass of RoPE embedding. Args: max_seq_len (int): Maximum size of sequence - offset (int, optional): _description_. Defaults to 0. + offset (int, optional): RoPE offset. Defaults to 0. + packed_seq (bool, optional): Whether to use packed sequence. Defaults to False. Returns: Tensor: Embeddings after applying RoPE. 
@@ -141,7 +146,7 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: ) # emb [seq_length, .., dim] emb = emb[:, None, None, :] - if parallel_state.get_context_parallel_world_size() > 1: + if parallel_state.get_context_parallel_world_size() > 1 and not packed_seq: # slice rotary_pos_emb along sequence dimension and select the parition of the current # CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) @@ -153,10 +158,11 @@ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): def get_rotary_seq_len( self, - inference_params, + inference_params: InferenceParams, transformer: TransformerBlock, transformer_input: Tensor, transformer_config: TransformerConfig, + packed_seq_params: PackedSeqParams, ) -> float: """Function to get the rotary sequence length. @@ -166,11 +172,16 @@ def get_rotary_seq_len( by the model transformer_input (Tensor): Input tensor to the transformer transformer_config (TransformerConfig): Transformer config used by the model + packed_seq_params (PackedSeqParams): Packed sequence params Returns: float: The rotary sequence length """ - if inference_params is not None: + if packed_seq_params is not None: + # max_seqlen is the maximum sequence length in the packed sequence before being divided + # by the tp and cp size. + return max(packed_seq_params.max_seqlen_q, packed_seq_params.max_seqlen_kv) + elif inference_params is not None: rotary_seq_len = inference_params.max_sequence_length else: if transformer.input_tensor is not None: diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index 14d147ea34..3ab155dcdb 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -4,6 +4,7 @@ import logging import math +from functools import lru_cache import torch from torch import Tensor @@ -82,8 +83,17 @@ def __init__( use_cpu_initialization, ) + @lru_cache(maxsize=32) def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + """Forward pass of Yarn Rotary Embedding. + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): RoPE offset. Defaults to 0. + + Returns: + Tensor: Embeddings after applying Yarn RoPE. + """ assert ( not self.rotary_interleaved ), "Yarn RoPE does not support interleaved rotary embeddings" diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index bd52f89680..f7567621f6 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -218,9 +218,12 @@ def forward( rotary_pos_emb = None if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( - inference_params, self.decoder, decoder_input, self.config + inference_params, self.decoder, decoder_input, self.config, packed_seq_params + ) + rotary_pos_emb = self.rotary_pos_emb( + rotary_seq_len, + packed_seq=packed_seq_params is not None and packed_seq_params.qkv_format == 'thd', + ) - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) # Run decoder.
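Both rotary embedding forwards above are now wrapped in `@lru_cache(maxsize=32)`, and the GPT model passes a `packed_seq` flag that becomes part of the cache key, so identical `(max_seq_len, offset, packed_seq)` calls reuse a previously built table instead of recomputing it every microbatch. A rough standalone sketch of that caching behaviour, with simplified frequency math and an illustrative class name:

from functools import lru_cache

import torch

class TinyRotaryEmbedding:
    # Simplified stand-in for the cached RoPE table computation.
    def __init__(self, dim: int, base: float = 10000.0):
        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

    @lru_cache(maxsize=32)
    def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> torch.Tensor:
        # Hashable scalar arguments make the call cacheable; repeating the same
        # (max_seq_len, offset, packed_seq) triple returns the stored tensor.
        seq = torch.arange(max_seq_len, dtype=torch.float32) + offset
        freqs = torch.outer(seq, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb[:, None, None, :]  # [seq, 1, 1, dim]

rope = TinyRotaryEmbedding(dim=8)
assert rope.forward(1024) is rope.forward(1024)                        # second call is a cache hit
assert rope.forward(1024, packed_seq=True) is not rope.forward(1024)   # different key, new table

The cache key includes the instance itself, so a long-lived embedding module reuses its tables across iterations, and packed and non-packed tables never collide because `packed_seq` is part of the key.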
hidden_states = self.decoder( diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 31fd8553e0..32fab28b49 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -283,8 +283,14 @@ def forward( q_pos_emb, k_pos_emb = rotary_pos_emb if packed_seq_params is not None: - cu_seqlens_q = packed_seq_params.cu_seqlens_q - cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + if packed_seq_params.cu_seqlens_q_padded is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded + else: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + if packed_seq_params.cu_seqlens_kv_padded is not None: + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv_padded + else: + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv else: cu_seqlens_q = cu_seqlens_kv = None query = apply_rotary_pos_emb( diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index c67913e164..8b374ca4be 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -493,11 +493,24 @@ def __post_init__(self): "When bias_activation_fusion is True, gated_linear_unit is False, " "and activation function is gelu, add_bias_linear must also be True." ) + if self.activation_func_fp8_input_store: if self.activation_func != F.silu or not self.gated_linear_unit: raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.") - if self.apply_rope_fusion and self.rotary_interleaved: - raise ValueError('rotary_interleaved does not work with apply_rope_fusion.') + + if self.apply_rope_fusion: + if self.rotary_interleaved: + raise ValueError("rotary_interleaved does not work with apply_rope_fusion.") + + from megatron.core.models.common.embeddings.rope_utils import HAVE_APPLY_ROPE_FUSION + + if not HAVE_APPLY_ROPE_FUSION: + raise ValueError( + "apply_rope_fusion is not available. Please install TE >= 1.4 or Apex." 
+ ) + + if self.multi_latent_attention and self.rotary_interleaved: + raise ValueError("rotary_interleaved does not work with multi_latent_attention.") if self.init_method is None: self.init_method = init_method_normal(self.init_method_std) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index e3d876a5f2..64c92ea3cd 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -547,6 +547,8 @@ def validate_args(args, defaults={}): raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') if args.rotary_interleaved and args.use_legacy_models: raise RuntimeError('--rotary-interleaved is not supported in legacy models.') + if args.position_embedding_type != 'rope': + args.apply_rope_fusion = False # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now # don't allow it to keep things simple diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index 7bdd0c46e2..1649d326ec 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -41,6 +41,7 @@ MODEL_ARGS: --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 2 --position-embedding-type: rope + --no-rope-fusion: true --no-ckpt-fully-parallel-save: true --deterministic-mode: true --no-gradient-accumulation-fusion: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index b2a1643ec8..6ca7dcf27f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -41,6 +41,7 @@ MODEL_ARGS: --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 2 --position-embedding-type: rope + --no-rope-fusion: true --no-ckpt-fully-parallel-save: true --deterministic-mode: true --no-gradient-accumulation-fusion: true From aa6be133ac7530916501a7be4cc34c6dcc169694 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 28 Oct 2024 03:56:59 -0700 Subject: [PATCH 8/8] ADLR/megatron-lm!2270 - ci: Faster unit tests --- .gitlab-ci.yml | 14 ++--- .gitlab/stages/00.pre.yml | 7 +-- .gitlab/stages/01.test.yml | 60 ++++++++++++------- .../shell_test_utils/run_ci_test.sh | 4 +- 4 files changed, 50 insertions(+), 35 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 83d432ea71..649ffb447b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,8 +13,8 @@ workflow: FUNCTIONAL_TEST: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 @@ -24,8 +24,8 @@ workflow: PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + 
UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 @@ -35,8 +35,8 @@ workflow: PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_REPEAT: 1 @@ -71,7 +71,7 @@ variables: value: "1" description: "Number of repetitions" UNIT_TEST_TIMEOUT: - value: "15" + value: "10" description: Timeout (minutes) for Unit tests (all repeats) FUNCTIONAL_TEST: value: "yes" diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 82cc9514f1..1b9e453554 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -41,10 +41,9 @@ pre:create_ci_branches: matrix: - branch: ci-unit-test-extended - branch: ci-rebuild-mcore-nemo-image - - branch: ci-mr-a100 - - branch: ci-nightly-a100 - - branch: ci-weekly-a100 - - branch: ci-weekly-h100 + - branch: ci-mr + - branch: ci-nightly + - branch: ci-weekly - branch: ci-pre-release tags: [mcore-docker-node-small] stage: .pre diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml index ca55de7d84..c12b5175ab 100644 --- a/.gitlab/stages/01.test.yml +++ b/.gitlab/stages/01.test.yml @@ -90,28 +90,50 @@ test:build_image: tags: [8xL40S] variables: GIT_STRATEGY: none + parallel: + matrix: + - BUCKET: tests/unit_tests/data/ + - BUCKET: tests/unit_tests/dist_checkpointing/ + - BUCKET: tests/unit_tests/distributed/ + - BUCKET: tests/unit_tests/models/ + - BUCKET: tests/unit_tests/pipeline_parallel/ tests/unit_tests/tensor_parallel/ + - BUCKET: tests/unit_tests/transformer/ + - BUCKET: other script: - - if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi; - - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" + - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" - | - docker exec mcore_ci_${CI_PIPELINE_ID} bash -c ' - set -e + CMD=$(cat <<"RUN_TEST_EOF" + set -euxo pipefail - MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") + MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") - cd /opt/megatron-lm$MCORE_DIR; + cd /opt/megatron-lm$MCORE_DIR; - for i in $(seq $UNIT_TEST_REPEAT); do - SEED=$((RANDOM % 9000 + 1000)); - ARGS=() - if [[ $TAG != latest ]]; then - ARGS+=(-m "not internal and not flaky and not flaky_in_dev") - else - ARGS+=(-m "not flaky and not flaky_in_dev") - fi - timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests - done - ' + for i in $(seq $UNIT_TEST_REPEAT); do + SEED=$((RANDOM % 9000 + 1000)); + ARGS=() + if [[ $TAG != latest ]]; then + ARGS+=(-m "not internal and not flaky and not flaky_in_dev") + else + ARGS+=(-m "not flaky and not flaky_in_dev") + fi + + if [[ $BUCKET == other ]]; then + BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " 
"\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " ")) + ARGS+=(${BUCKETS[@]}) + BUCKET=(tests/unit_tests) + else + BUCKET=(${BUCKET}) + fi + + if [[ -d $BUCKET ]]; then + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" ${BUCKET[@]} + fi + done + RUN_TEST_EOF + ) + + docker exec mcore_ci_${CI_PIPELINE_ID} bash -c "$CMD" after_script: - docker container stop mcore_ci_${CI_PIPELINE_ID} || true artifacts: @@ -135,8 +157,6 @@ test:pyt(LTS)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_LTS_IMAGE} - UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 15 test:pyt(DEV)_mcore(latest): extends: [.unit_tests] @@ -149,8 +169,6 @@ test:pyt(DEV)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_DEV_IMAGE} - UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 15 test:notify_unit_tests: extends: [.test_rules] diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 9dc22e3929..fac0704b4c 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -42,10 +42,8 @@ NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \ | yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO') SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \ | yq '.ENV_VARS.SKIP_PYTEST') -N_REPEATS=$(cat $TRAINING_PARAMS_PATH \ - | yq '.ENV_VARS.N_REPEATS //1') -for i in $(seq 1 $N_REPEATS); +for i in $(seq 1 $N_REPEAT); do if [[ $i -gt 1 ]]; then rm -rf $CHECKPOINT_PATH/*