From 2501d5282dc3ccd2379f43b75cc5b3289294b88f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 25 Oct 2024 08:51:35 -0700 Subject: [PATCH 1/8] ADLR/megatron-lm!2262 - ci: Allow dry-run of publish --- .gitlab-ci.yml | 17 +++- .gitlab/stages/01.test.yml | 157 ++++++++++++++++++++++++++++++++-- .gitlab/stages/03.publish.yml | 110 ++++++++++++------------ 3 files changed, 217 insertions(+), 67 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d261ed34b8..1f01679099 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,6 +19,7 @@ workflow: FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 5 @@ -27,6 +28,7 @@ workflow: FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 5 @@ -35,9 +37,11 @@ workflow: FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" + PUBLISH: "no" - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: FUNCTIONAL_TEST: "no" + PUBLISH: "no" - when: never auto_cancel: on_new_commit: interruptible @@ -45,19 +49,24 @@ workflow: stages: - test - functional_tests - - convergence_tests - publish default: interruptible: true variables: - UNIT_TEST_TIMEOUT: - value: "15" - description: Timeout (minutes) for Unit tests (all repeats) + UNIT_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the funtional test suite UNIT_TEST_REPEAT: value: "1" description: "Number of repetitions" + UNIT_TEST_TIMEOUT: + value: "15" + description: Timeout (minutes) for Unit tests (all repeats) FUNCTIONAL_TEST: value: "yes" options: diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml index 079f3695fb..ca55de7d84 100644 --- a/.gitlab/stages/01.test.yml +++ b/.gitlab/stages/01.test.yml @@ -1,6 +1,6 @@ .test_rules: rules: - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true when: on_success - when: on_success @@ -46,7 +46,7 @@ test:build_image: ADDITIONAL_PARAMS=() - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" ]]; then ADDITIONAL_PARAMS+=("--pull") ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") fi @@ -118,10 +118,10 @@ test:build_image: paths: - coverage rules: - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0' + - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0' allow_failure: true when: on_success - - if: $UNIT_TEST_REPEAT != '0' + - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0' when: on_success test:pyt(LTS)_mcore(latest): @@ -135,6 +135,8 @@ test:pyt(LTS)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_LTS_IMAGE} + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 15 test:pyt(DEV)_mcore(latest): extends: [.unit_tests] @@ -147,8 +149,10 @@ test:pyt(DEV)_mcore(0.9.0): variables: TAG: core_r0.9.0 
IMAGE: ${CI_MCORE_DEV_IMAGE} + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 15 -test:notify: +test:notify_unit_tests: extends: [.test_rules] image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} needs: @@ -229,4 +233,145 @@ test:secret_detection: echo "Atleast one vulnerability has been found" cat gl-secret-detection-report.json | jq '.' exit 1 - fi \ No newline at end of file + fi + +test:pypi_build_wheel: + extends: [.test_rules] + image: + name: quay.io/pypa/manylinux_2_28_x86_64 + entrypoint: [""] + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - echo $PUBLISH_DRYRUN + - > + if [ "$PUBLISH_DRYRUN" = "yes" ]; then + sed -i "/^PATCH/c\PATCH = $((RANDOM % 9000 + 1000))" megatron/core/package_info.py + fi + - /opt/python/cp310-cp310/bin/python -m build + - /opt/python/cp311-cp311/bin/python -m build + - auditwheel repair dist/*.whl + artifacts: + paths: + - megatron/core/package_info.py + - wheelhouse/ + +test:pypi_test_wheel: + extends: [.test_rules] + image: nvcr.io/nvidia/pytorch:24.01-py3 + needs: [test:pypi_build_wheel] + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - EXPECTED_RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - rm -rf megatron + - pip install wheelhouse/*cp310*.whl + + - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - > + echo "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER" + - test "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER" + artifacts: + paths: + - wheelhouse/ + +test:pypi_push_wheel: + extends: [.test_rules] + image: python:3.10 + tags: [mcore-docker-node-small] + needs: [test:pypi_test_wheel] + variables: + PUBLISH_DRYRUN: "yes" + script: + - > + if [ "$PUBLISH_DRYRUN" = "yes" ]; then + REPOSITORY=testpypi + export TWINE_USERNAME=$TWINE_TEST_USERNAME + export TWINE_PASSWORT=$TWINE_TEST_PASSWORD + else + REPOSITORY=pypi + export TWINE_USERNAME=$TWINE_PROD_USERNAME + export TWINE_PASSWORT=$TWINE_PROD_PASSWORD + fi + - pip install twine + - twine upload -u $TWINE_USERNAME -p $TWINE_PASSWORT --repository $REPOSITORY wheelhouse/* + +test:gh_release: + extends: [.test_rules] + tags: [mcore-docker-node-small] + image: nvcr.io/nvidia/pytorch:24.01-py3 + variables: + PUBLISH_DRYRUN: "yes" + script: + - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + - NAME="NVIDIA Megatron Core $RELEASE_NUMBER" + - CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) + - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d') + - > + PAYLOAD=$(jq -nc \ + --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \ + --arg NAME "$NAME" \ + --arg BODY "$CHANGELOG" \ + '{ + "tag_name": $CI_COMMIT_BRANCH, + "target_commitish": $CI_COMMIT_BRANCH, + "name": $NAME, + "body": $BODY, + "draft": false, + "prerelease": false, + "generate_release_notes": false + }' + ) + - > + CMD=$(echo curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/Megatron-LM/releases \ + -d "$PAYLOAD" + ) + + if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then + echo "$CMD" + else + eval "$CMD" + fi + +test:notify_release: + needs: [test:pypi_push_wheel, test:gh_release] + extends: [.test_rules] + image: nvcr.io/nvidia/pytorch:24.01-py3 + tags: [mcore-docker-node-small] + variables: + PUBLISH_DRYRUN: "yes" + script: + - VERSION=$(python -c "from megatron import core; print(core.__version__)") + - 
URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_r$VERSION" + - > + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "Releasebot 🤖: Megatron-Core released <'$URL'|core_r'$VERSION'> 🚀" + } + } + ] + }' + - echo "$MESSAGE" + - > + CMD=$(echo curl \ + -X POST \ + -H "Content-type: application/json" \ + --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN} + ) + + if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then + echo "$CMD" + else + eval "$CMD" + fi diff --git a/.gitlab/stages/03.publish.yml b/.gitlab/stages/03.publish.yml index e1ee94bd19..4639d7690f 100644 --- a/.gitlab/stages/03.publish.yml +++ b/.gitlab/stages/03.publish.yml @@ -1,24 +1,28 @@ .publish_common_freeze: - stage: functional_tests + stage: publish rules: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze" when: manual - when: never .publish_common_release: - stage: functional_tests + stage: publish rules: - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" when: manual + - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" + when: manual + variables: + PUBLISH_DRYRUN: "yes" - when: never -create-release-branch: +publish:release_branch: extends: [.publish_common_freeze] image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} needs: [test:build_image] tags: [mcore-docker-node-small] variables: - GIT_STRATEGY: "clone" + GIT_STRATEGY: "none" script: - git fetch origin $CI_DEFAULT_BRANCH - git config --global user.email "mcore-bot@nvidia.com" @@ -26,8 +30,8 @@ create-release-branch: - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py - VERSION=$(python -c "from megatron import core; print(core.__version__)") - - git switch --force-create core_r$VERSION origin/$CI_DEFAULT_BRANCH - - git push -u origin core_r$VERSION --force + - RELEASE_BRANCH=core_r$VERSION + - git switch --force-create $RELEASE_BRANCH origin/$CI_DEFAULT_BRANCH - | MESSAGE='{ "blocks": [ @@ -35,61 +39,53 @@ create-release-branch: "type": "section", "text": { "type": "mrkdwn", - "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `core_r$VERSION`" + "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `'"$RELEASE_BRANCH"'`" } } ] }' - + - > curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN} + - git switch --force-create bot/chore/bump-version + - git add megatron/core/package_info.py + - > + git commit -m "chore: adjust version version" + - git push -u origin bot/chore/bump-version + - > + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=bot/chore/bump-version" \ + -d "target_branch=$RELEASE_BRANCH" \ + -d "title=chore: Fix version of \`$RELEASE_BRANCH\`" \ + -d "description=[🤖]: Hi @okoenig 👋,

we've adjusted the version number of \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" -publish-wheel: - extends: [.publish_common_release] - image: quay.io/pypa/manylinux_2_28_x86_64 - tags: [mcore-docker-node-small] - script: - - export TWINE_USERNAME - - export TWINE_PASSWORT - - /opt/python/cp311-cp311/bin/pip install twine - - /opt/python/cp310-cp310/bin/python -m build - - /opt/python/cp311-cp311/bin/python -m build - - auditwheel repair dist/*.whl - - twine upload --repository pypi wheelhouse/* - -create-gh-release: - extends: [.publish_common_release] - tags: [mcore-docker-node-small] - image: - name: registry.gitlab.com/gitlab-ci-utils/curl-jq - entrypoint: [""] - script: - - | - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") - NAME="NVIDIA Megatron Core $RELEASE_NUMBER" - CHANGELOG=$(awk '/^## '$NAME'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d') - - PAYLOAD=$(jq \ - -n \ - -c \ - --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \ - --arg NAME "$NAME" \ - --arg BODY "$CHANGELOG" \ - '{ - "tag_name": $CI_COMMIT_BRANCH, - "target_commitish": $CI_COMMIT_BRANCH, - "name": $NAME, - "body": $BODY, - "draft": false, - "prerelease": false, - "generate_release_notes": false - }' - ) +publish:pypi_build_wheel: + extends: [test:pypi_build_wheel, .publish_common_release] + dependencies: [] + variables: + PUBLISH_DRYRUN: "no" + +publish:pypi_test_wheel: + extends: [test:pypi_test_wheel, .publish_common_release] + needs: [publish:pypi_build_wheel] + variables: + PUBLISH_DRYRUN: "no" + +publish:pypi_push_wheel: + extends: [test:pypi_push_wheel, .publish_common_release] + needs: [publish:pypi_test_wheel] + variables: + PUBLISH_DRYRUN: "no" - curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GH_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/NVIDIA/Megatron-LM/releases \ - -d $PAYLOAD \ No newline at end of file +publish:gh_release: + extends: [test:gh_release, .publish_common_release] + dependencies: [] + variables: + PUBLISH_DRYRUN: "no" + +publish:notify_release: + needs: [publish:pypi_push_wheel, publish:gh_release] + extends: [test:notify_release, .publish_common_release] + variables: + PUBLISH_DRYRUN: "no" \ No newline at end of file From 8bac43ac38b8e57828601fdd39e2b6bef6919108 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 25 Oct 2024 09:24:02 -0700 Subject: [PATCH 2/8] ADLR/megatron-lm!2265 - ci: Fix notifications --- tests/functional_tests/jet_recipes/gpt.yaml | 6 +++--- tests/functional_tests/jet_recipes/t5.yaml | 4 ++-- tests/functional_tests/shell_test_utils/notify.sh | 2 +- .../functional_tests/shell_test_utils/notify_unit_tests.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index 04791f0ef2..196c3372c9 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -55,7 +55,6 @@ products: - gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G - 
gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G @@ -73,10 +72,9 @@ products: - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G + # - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G - - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G @@ -139,11 +137,13 @@ products: platforms: [dgx_a100] time_limit: [1800] test_case: + - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G - environment: [lts] scope: [weekly] platforms: [dgx_h100] diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 85c28c9403..6635199025 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -36,14 +36,14 @@ products: - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - environment: [lts] scope: [mr] time_limit: [1800] test_case: + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G - environment: [lts] scope: [weekly] diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh index cbdc0e7030..4fa9d5deae 100644 --- a/tests/functional_tests/shell_test_utils/notify.sh +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -48,7 +48,7 @@ if [[ ${ret_code:-0} -ne 0 ]]; then fi # Fetch GitLab logs of JET downstream pipeline -DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "jet-trigger-" + $environment) | .downstream_pipeline.id' <<< "$PIPELINE_JSON") +DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "functional:run_" + $environment) | .downstream_pipeline.id' <<< "$PIPELINE_JSON") PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh index 86cb29b772..e16f8d81f9 100644 --- a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh +++ 
b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -48,7 +48,7 @@ if [[ ${ret_code:-0} -ne 0 ]]; then exit 1 fi -UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("test:unit_tests_"))]') +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("test:pyt"))]') if [[ $UNIT_TESTS_JOBS == null ]]; then FAILED_JOBS=$(curl \ From ef6cba6d0171907c637f840e0dff344fc70569b5 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 02:59:11 -0700 Subject: [PATCH 3/8] ADLR/megatron-lm!2266 - ci: Move REPEATS to launcher level --- .gitlab-ci.yml | 12 +++++ .gitlab/stages/02.functional-tests.yml | 4 ++ tests/functional_tests/jet_recipes/bert.yaml | 3 ++ .../jet_recipes/gpt-nemo.yaml | 2 + tests/functional_tests/jet_recipes/gpt.yaml | 4 ++ .../jet_recipes/multimodal-llava.yaml | 2 + tests/functional_tests/jet_recipes/t5.yaml | 4 ++ .../python_test_utils/jet/common.py | 4 ++ .../jet/generate_jet_trigger_job.py | 6 +++ .../jet/launch_jet_workload.py | 10 +++++ .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 5 +-- .../bert/bert_release/model_config.yaml | 44 ++++++++----------- .../common/ckpt_converter/model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../gpt/gpt3_15b_8t_release/model_config.yaml | 15 +------ .../gpt3_15b_8t_release_sm/model_config.yaml | 15 +------ .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 3 +- .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 2 +- .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - 
.../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 14 ------ .../model_config.yaml | 16 +------ .../model_config.yaml | 16 +------ .../model_config.yaml | 14 ------ .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 1 - .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../t5/t5_release/model_config.yaml | 33 ++++++-------- .../test_flattened_resharding.py | 1 + 156 files changed, 130 insertions(+), 303 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f01679099..06334601b4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,6 +17,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_TIME_LIMIT: 1800, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -26,6 +28,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_REPEAT: 5 + FUNCTIONAL_TEST_TIME_LIMIT: 1800, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -35,6 +39,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_REPEAT: 1, + FUNCTIONAL_TEST_TIME_LIMIT: 9000, FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -82,6 +88,12 @@ variables: - "pre-release" - "release" description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" + FUNCTIONAL_TEST_REPEAT: + value: "5" + description: "Number of repetitions per test" + FUNCTIONAL_TEST_TIME_LIMIT: + value: "1800" + description: "Timeout in seconds per test" FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 07f4966734..db49c99c60 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -57,6 +57,8 @@ functional:configure: python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment dev \ + --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ + --time-limit 
"$FUNCTIONAL_TEST_TIME_LIMIT" \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ @@ -68,6 +70,8 @@ functional:configure: python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment lts \ + --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ + --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 30349d708d..89a097641e 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_bert.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: - environment: [lts, dev] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G @@ -43,6 +45,7 @@ products: - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G - environment: [lts] scope: [nightly] + n_repeat: [5] time_limit: [3600] test_case: - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 366cae1f21..01e79b4793 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -31,6 +32,7 @@ spec: products: - environment: [dev] scope: [mr] + n_repeat: [5] test_case: - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index 196c3372c9..32ee90109b 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -23,6 +23,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_gpt.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: scope: [mr] platforms: [dgx_a100] time_limit: [1800] + n_repeat: [5] test_case: - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G @@ -103,6 +105,7 @@ products: scope: [nightly] platforms: [dgx_a100] time_limit: [3600] + n_repeat: [5] test_case: - 
gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather @@ -136,6 +139,7 @@ products: scope: [mr] platforms: [dgx_a100] time_limit: [1800] + n_repeat: [5] test_case: - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 981404db64..a6202e4910 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_vlm.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -31,6 +32,7 @@ spec: products: - environment: [lts, dev] scope: [mr] + n_repeat: [5] test_case: - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 6635199025..eb76892661 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -24,6 +24,7 @@ spec: "TRAINING_SCRIPT_PATH=pretrain_t5.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + "N_REPEAT={n_repeat}" ) bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} @@ -32,6 +33,7 @@ products: - environment: [lts, dev] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G @@ -41,6 +43,7 @@ products: - environment: [lts] scope: [mr] time_limit: [1800] + n_repeat: [5] test_case: - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G @@ -48,6 +51,7 @@ products: - environment: [lts] scope: [weekly] time_limit: [9000] + n_repeat: [1] test_case: - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py index eed22752c6..9313e0a59c 100644 --- a/tests/functional_tests/python_test_utils/jet/common.py +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -134,6 +134,8 @@ def filter_by_model( def load_workloads( container_tag: str, + n_repeat: int = 1, + time_limit: int = 1800, environment: Optional[str] = None, scope: Optional[str] = None, model: Optional[str] = None, @@ -171,4 +173,6 @@ def load_workloads( container_image = container_image or build_workload.spec.source.image build_workload.spec.source.image = f"{container_image}:{container_tag}" workloads.append(build_workload) + workload.spec.n_repeat = n_repeat + workload.spec.time_limit = time_limit return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py 
b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 3922de3f86..670072fc86 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -12,6 +12,8 @@ @click.command() @click.option("--scope", required=True, type=str, help="Test scope") @click.option("--environment", required=True, type=str, help="LTS or dev features") +@click.option("--n-repeat", required=False, default=1, type=int) +@click.option("--time-limit", required=False, default=1, type=int) @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") @@ -29,6 +31,8 @@ def main( scope: str, environment: str, + n_repeat: int, + time_limit: int, a100_cluster: str, h100_cluster: str, output_path: str, @@ -63,6 +67,8 @@ def main( "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", f"--model {test_case.spec.model}", f"--environment {test_case.spec.environment}", + f"--n-repeat {n_repeat}", + f"--time-limit {time_limit}", f"--test-case {test_case.spec.test_case}", f"--container-tag {container_tag}", f"--cluster {cluster}", diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py index 5ec4e84ae1..0418dd3937 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -42,6 +42,8 @@ def sigterm_handler(_signo, _stack_frame): def launch_and_wait_for_completion( test_case: str, environment: str, + n_repeat: int, + time_limit: int, container_image: str, container_tag: str, cluster: str, @@ -54,6 +56,8 @@ def launch_and_wait_for_completion( ).workloads.submit( workloads=common.load_workloads( test_case=test_case, + n_repeat=n_repeat, + time_limit=time_limit, container_image=container_image, container_tag=container_tag, environment=environment, @@ -142,6 +146,8 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]: @click.option( "--environment", required=True, type=click.Choice(['dev', 'lts']), help="Pytorch LTS or DEV" ) +@click.option("--n-repeat", required=False, default=1, type=int) +@click.option("--time-limit", required=False, default=1800, type=int) @click.option( "--account", required=False, @@ -165,6 +171,8 @@ def main( model: str, test_case: str, environment: str, + n_repeat: int, + time_limit: int, account: str, cluster: str, container_tag: str, @@ -195,6 +203,8 @@ def main( pipeline = launch_and_wait_for_completion( test_case=test_case, environment=environment, + n_repeat=n_repeat, + time_limit=time_limit, container_image=container_image, container_tag=container_tag, cluster=cluster, diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 704fd1ce5a..d9268d02ec 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 
--hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml index eaf288d30d..207acb5aa4 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 7072374fab..a8fb420757 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml index f3afb10fd5..10fbeb700e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml index 1e8f604797..991dfae683 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 66ab6cabfd..cfc4827a2e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 
94d2f2feca..c3c70f8b0e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 2f6d24e945..9ffa49327d 100644 --- a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index cb94c9c91b..73ad47092d 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,4 +42,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml index 3dd071d3de..29fa50cab2 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -44,4 +43,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 6d39266da3..d8fb0dc61f 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -5,7 +5,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,4 +42,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml 
b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 989988f7cd..2d35954bf4 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -6,7 +6,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -46,4 +45,4 @@ MODEL_ARGS: --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index edcf75a772..abc650a5e2 100644 --- a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -6,7 +6,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 NVTE_APPLY_QK_LAYER_SCALING: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -43,7 +42,7 @@ MODEL_ARGS: --deterministic-mode: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} - --fp16: true + --fp16: true --apply-query-key-layer-scaling: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml index 5c92fbf7da..b9de9dc01f 100644 --- a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml @@ -3,52 +3,46 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' NVTE_FLASH_ATTN: '0' NVTE_FUSED_ATTN: '0' - TEST_TYPE: 'release' - MODEL_ARGS: # Bert model args - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --seq-length: 512 - --max-position-embeddings: 512 - + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --seq-length: 512 + --max-position-embeddings: 512 # Training args - --micro-batch-size: 4 - --global-batch-size: 32 - --train-iters: 20000 - --weight-decay: 1e-2 - --clip-grad: 1.0 + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 20000 + --weight-decay: 1e-2 + --clip-grad: 1.0 --fp16: true --lr: 0.0001 - --lr-decay-style: linear - --min-lr: 1.0e-5 - --lr-warmup-fraction: .01 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: .01 --bert-no-binary-head: true - # Model parallel - --tensor-model-parallel-size: 8 - --pipeline-model-parallel-size: 8 - + --tensor-model-parallel-size: 8 + --pipeline-model-parallel-size: 8 # Data args --data-path: ${DATA_BLEND} - --vocab-file: ${DATA_PATH}/vocab.txt + --vocab-file: ${DATA_PATH}/vocab.txt --split: 949,50,1 --data-cache-path: ${DATA_CACHE_PATH} - # EVAL_AND_LOGGING_ARGS --log-interval: 100 --save-interval: 2000 - --eval-interval: 1000 + --eval-interval: 1000 --save: ${CHECKPOINT_PATH} --load: ${CHECKPOINT_PATH} --eval-iters: 10 - --tensorboard-dir: ${TENSORBOARD_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true --log-num-zeros-in-grad: true --log-params-norm: true --log-validation-ppl-to-tensorboard: true --wandb-project: 
megatron-core-release-runs - --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file + --wandb-exp-name: ${WANDB_EXPERIMENT} diff --git a/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml b/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml index bffa64bc52..2ac5db1147 100644 --- a/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml +++ b/tests/functional_tests/test_cases/common/ckpt_converter/model_config.yaml @@ -3,6 +3,5 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml index 89c71f6291..51dbdfd67b 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 - N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index d7e926e96e..a48bfeae7f 100644 --- a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 SKIP_PYTEST: 1 - N_REPEATS: 1 MODEL_ARGS: trainer.num_nodes: 1 trainer.devices: 8 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index bf88792152..89bc2ae8b6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -17,7 +15,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -27,10 +24,8 @@ MODEL_ARGS: --global-batch-size: 1152 --train-samples: 19531250 --manual-gc: true - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer @@ -39,7 +34,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --apply-layernorm-1p: true --untie-embeddings-and-output-weights: true @@ -54,13 +48,11 @@ MODEL_ARGS: --num-query-groups: 8 --seq-length: 4096 --max-position-embeddings: 4096 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 
0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -71,19 +63,15 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add validation args --eval-iters: 32 --eval-interval: 2000 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.0134 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -95,6 +83,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args - --bf16: true \ No newline at end of file + --bf16: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 9453db100c..b279c96f05 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -17,7 +15,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -27,10 +24,8 @@ MODEL_ARGS: --global-batch-size: 1152 --train-samples: 4882812 --manual-gc: true - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer @@ -39,7 +34,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --apply-layernorm-1p: true --untie-embeddings-and-output-weights: true @@ -54,13 +48,11 @@ MODEL_ARGS: --num-query-groups: 8 --seq-length: 4096 --max-position-embeddings: 4096 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -71,19 +63,15 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add validation args --eval-iters: 32 --eval-interval: 2000 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.0134 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -95,6 +83,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args - --bf16: true \ No newline at end of file + --bf16: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index 459270a1b2..69ad59f080 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -50,4 +49,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml index dcb80dc007..fd1e7253c9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml index d94f5277d4..2b94108731 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml index 9f210d838f..d9ed9c7602 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml index b943bfec0f..abb85baa55 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml index 108cb6b1a4..e40b6f61ee 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml index 1c2a42eaaa..a2960f3a37 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml index cb0214f264..6beae45b8a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml index 97d3d8c5f0..d50c59d5f6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --ckpt-format: torch_dist --data-cache-path: ${DATA_CACHE_PATH} --bf16: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml index 1a15825731..2b01cfa62f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml index c6728722e2..a74327d67f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml index 37cc4615a5..267a290a59 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end 
of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml index 528b691a28..77c55fac92 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml index 4f5e8d93b7..d5d4413669 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml index 64d504bf29..7fac1317c4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml index 190e5777f2..2c05343a10 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} 
--fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml index 99d0ac8f6b..2d4f4d2a15 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml index 6242b2ebbc..05eb509e6b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml index 81727e052d..4b1288dbe2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml index 525d0f2c90..d55fb7510c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml index 516e1dd517..c0aceac272 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml index 10fc8c2f23..c2439f9f36 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml index ba219d4445..4c3a4fb095 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -51,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml index c547f47970..69dc9edf52 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml index 72c98e80be..bd324b8ba1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -52,4 +51,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml index 03ddd8a7ca..e8723049fb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -49,4 +48,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml index 84128fa780..226809ade0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -47,4 +46,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml index b664115f27..8746c03a36 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml index 0ec5d88ad9..7d0be91444 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 @@ -48,4 +47,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --fp16: true --apply-query-key-layer-scaling: true -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index ee84d93de2..c9de15222e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml index ffdaec80ad..90c257012f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 9dd9e9ecd0..fcaad99320 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index 470ba6f926..1741647355 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml index fb07f9d30c..b51ada7c08 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index 7cdb56dd00..2d2c1ce9a0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index 7bdd0c46e2..7689c48dcc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index b014fdabc0..40f43682b7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree 
CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index b2a1643ec8..ecc4c7fa76 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml index 6c2c9e51ab..65a87d67a1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 2e0188551a..f3e4ce8a6f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml index 8fa10f4b9d..440638b53d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml index c64a4ef5e7..059716a6a3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml index dda1876e1a..f82a51e4f3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index df7ba9fb3b..3d4dc222a4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml index 479916c654..3e5acc65a0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml index 20c57f0c95..9ae648b7bf 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml index f7c52c997f..85e8e81ff3 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 210febf448..fea891cd94 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml index fd67df60ca..b096c06b6c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index 0c0bc85f61..a2c641b31d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 7a92bfd8cd..2b9346ee7e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index ef5b64d284..61adccbb97 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml index ca1de0ad37..023747a480 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 30137a040d..e573b90971 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml index 1513a18192..c31e5b66b3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml index 077c9a36e8..9b02b473bd 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 1ccbe1ae31..d98716ac4d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index b9ca819495..92b2e3528a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 25ea6c933b..1f2fa9e2dc 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml index 7b7bc27f4b..49865dde85 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml index 059265a079..bdb6ab3081 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml @@ -49,4 +49,4 @@ MODEL_ARGS: --bf16: true --decoder-first-pipeline-num-layers: 2 --decoder-last-pipeline-num-layers: 2 -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 7da0cc5ddd..01c7ffc2f1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 476a1b6b93..2cc6bd5c6f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index 613559a96e..95f6e35591 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index a1f86a64c7..edc9eed73d 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index 6c454ecca7..b12ef70b9e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index cf4a90e410..5246a6ecf1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 793bfb21d4..46a56c1090 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml index 29b87e9073..3d4d717349 100644 --- 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml index c4b791a9d4..be3e678db6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml index c2631e84e0..a2fb0f51af 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml index bc5da0c312..f3da93728f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml index 7c437e0b10..91e9e836c0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff 
--git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index dde8a620d3..5630ddd719 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 303182bcaf..8f0bf337b9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index c08ce2e01c..31544968ff 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index 959c286a50..75a485403a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index c9938b5ee1..9b5deed4cb 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index 23060e55e4..693a2d39f9 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 32bd642deb..3aa23b39a4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml index 7d64cf477f..4a8a6abdd0 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml index 6014052dd6..95f706d04a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml index 6d8a590974..e74a0cc992 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml index c304692d62..f041fd4ac7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index d8f1585ae2..e683475ffd 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml index c02d1fdc67..1b416d029a 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml index 7d5b13b753..4f922838b3 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml index cff824669b..bdb039ffda 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index 8846dacb40..b56afa8e52 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 9295cdc580..f482eda5e6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index b8f1667cdb..43224c5849 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml index d2888f767c..dda321f572 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml index 27acfbee86..93e1ce6463 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml index 1ea30bae73..6418b0c5d2 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml index f3348d608d..a5de201786 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -1,7 +1,6 @@ ENV_VARS: CUDA_DEVICE_MAX_CONNECTIONS: 1 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml index fbb767cb14..226dfbc6b6 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml index cf65df920f..168da23f9b 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 SKIP_PYTEST: 1 - N_REPEATS: 1 BEFORE_SCRIPT: pip uninstall -y transformer_engine pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely MODEL_ARGS: --num-layers: 12 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index af105662a9..56d76fa39e 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 3d27f95aa6..52b0887e00 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml index 1e6b07a429..0923fd41f1 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml index 2ff5fc2224..9ea57cb3ac 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 4e4a963417..ea96682fe4 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index 8d11e207e7..beaaa986ab 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 512 diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index 9516076dc6..9f913d089f 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -4,9 +4,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True NCCL_NVLS_ENABLE: 0 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -15,7 +13,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -25,10 +22,8 @@ MODEL_ARGS: --global-batch-size: 256 --train-samples: 38400 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer @@ -37,7 +32,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -54,13 +48,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 255126953 --lr-warmup-samples: 162761 @@ -69,7 +61,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 8 --num-experts: 8 @@ -78,11 +69,9 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 500 - # Add checkpointing args --finetune: true --auto-detect-ckpt-format: true @@ -90,10 +79,8 @@ MODEL_ARGS: --save: ${OUTPUT_PATH}/checkpoints --no-ckpt-fully-parallel-save: true --save-interval: 500 - # Add initialization args --init-method-std: 0.008 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml 
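The hunks above drop the per-test N_REPEATS override from the functional-test model_config.yaml files; the repeat count is now supplied at the pipeline level (FUNCTIONAL_TEST_REPEAT / --n-repeat, see the CI changes later in this series). A rough sketch of how such a config might be consumed is below; the loader function and launch flow are illustrative assumptions, not part of this patch.

    # Illustrative only: a minimal loader for one of these model_config.yaml files.
    import os
    import yaml

    def load_model_config(path: str) -> dict:
        with open(path) as f:
            cfg = yaml.safe_load(f)
        # ENV_VARS entries (e.g. NVTE_ALLOW_NONDETERMINISTIC_ALGO) become process env;
        # the repeat count is no longer read from here now that N_REPEATS is gone.
        for key, value in cfg.get("ENV_VARS", {}).items():
            os.environ[key] = str(value)
        # MODEL_ARGS maps flag names to values, e.g. {"--num-layers": 12, ...}
        return cfg.get("MODEL_ARGS", {})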
b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index 585d9bb2c7..fa483b8770 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -18,7 +16,6 @@ MODEL_ARGS: --overlap-grad-reduce: true --overlap-param-gather: true --no-ckpt-fully-parallel-save: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -28,19 +25,16 @@ MODEL_ARGS: --global-batch-size: 1024 --train-samples: 24414063 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer - --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -57,13 +51,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -72,7 +64,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 4 --num-experts: 8 @@ -81,19 +72,15 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.010 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index 22607416a3..969e9f17e6 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -6,9 +6,7 @@ ENV_VARS: NVTE_BWD_LAYERNORM_SM_MARGIN: 16 NCCL_P2P_NET_CHUNKSIZE: 2097152 NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -18,7 +16,6 @@ MODEL_ARGS: --overlap-grad-reduce: true --overlap-param-gather: true --no-ckpt-fully-parallel-save: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -28,19 +25,16 @@ MODEL_ARGS: --global-batch-size: 1024 --train-samples: 6103515 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args 
--data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer - --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -57,13 +51,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 1949218748 --lr-warmup-samples: 3906252 @@ -72,7 +64,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 4 --num-experts: 8 @@ -81,19 +72,15 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --load: ${OUTPUT_PATH}/checkpoints --save: ${OUTPUT_PATH}/checkpoints --save-interval: 500 - # Add initialization args --init-method-std: 0.010 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -105,6 +92,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index 39421a887e..33593ffca7 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -4,9 +4,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True NCCL_NVLS_ENABLE: 0 - TEST_TYPE: "release" - MODEL_ARGS: # Distributed args --distributed-timeout-minutes: 60 @@ -16,7 +14,6 @@ MODEL_ARGS: --use-distributed-optimizer: true --overlap-grad-reduce: true --overlap-param-gather: true - # Training args --use-mcore-models: true --sequence-parallel: true @@ -26,10 +23,8 @@ MODEL_ARGS: --global-batch-size: 256 --train-samples: 51200 --exit-duration-in-mins: 230 - # Transformer Engine args --transformer-impl: transformer_engine - # Data args --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer @@ -38,7 +33,6 @@ MODEL_ARGS: --split: 99,1,0 --no-mmap-bin-files: true --num-workers: 6 - # Add network size args --untie-embeddings-and-output-weights: true --no-position-embedding: true @@ -55,13 +49,11 @@ MODEL_ARGS: --seq-length: 4096 --max-position-embeddings: 4096 --make-vocab-size-divisible-by: 128 - # Add regularization args --attention-dropout: 0.0 --hidden-dropout: 0.0 --clip-grad: 1.0 --weight-decay: 0.1 - # Add learning rate args --lr-decay-samples: 255126953 --lr-warmup-samples: 162761 @@ -70,7 +62,6 @@ MODEL_ARGS: --lr-decay-style: cosine --adam-beta1: 0.9 --adam-beta2: 0.95 - # Add MoE args --expert-model-parallel-size: 8 --num-experts: 8 @@ -79,11 +70,9 @@ MODEL_ARGS: --moe-grouped-gemm: true --moe-aux-loss-coeff: 1e-2 --moe-token-dispatcher-type: alltoall - # Add validation args --eval-iters: 32 --eval-interval: 200 - # Add checkpointing args --finetune: true --auto-detect-ckpt-format: true @@ 
-91,10 +80,8 @@ MODEL_ARGS: --save: ${OUTPUT_PATH}/checkpoints --no-ckpt-fully-parallel-save: true --save-interval: 500 - # Add initialization args --init-method-std: 0.008 - # Add logging args --log-timers-to-tensorboard: true --log-memory-to-tensorboard: true @@ -106,6 +93,5 @@ MODEL_ARGS: --tensorboard-dir: ${OUTPUT_PATH}/tensorboard --wandb-project: megatron-core-release-runs --wandb-exp-name: ${WANDB_EXPERIMENT} - # Add mixed precision args --bf16: true diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml index 6da0c3a85a..b3b81d5033 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml index 816aa8bf1f..cdfdac5ffe 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml index 180e6beedd..22f816cd89 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml index 1fade8fd4e..4a829aca1d 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml @@ -4,7 +4,6 @@ ENV_VARS: NCCL_ALGO: Tree CUBLAS_WORKSPACE_CONFIG: :4096:8 GPUS_PER_NODE: 7 - N_REPEATS: 5 MODEL_ARGS: --num-layers: 12 --hidden-size: 624 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 076389c3d6..e781e0980b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index b0d00b8f83..33daffa1e1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml index d1b9e8429e..ac40afa88a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 540d4c1b73..7a1690768a 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml index 8abace27d3..2df13fd07b 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml index c1a6d51bf1..23f9be2841 100644 --- 
a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml index 6aae44ca71..3f19d3a3f1 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml index 6e9731d4ce..243e1fc052 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -3,7 +3,6 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: ^NVLS CUBLAS_WORKSPACE_CONFIG: :4096:8 - N_REPEATS: 5 MODEL_ARGS: --encoder-num-layers: 12 --decoder-num-layers: 12 diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml index 6556baeb59..798f00c902 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -51,4 +51,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml index 70077b84a9..df56656bd6 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml @@ -51,4 +51,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml index 3a1793957b..940b85cfab 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml +++ 
b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml @@ -52,4 +52,4 @@ MODEL_ARGS: --deterministic-mode: true --attention-softmax-in-fp32: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml index 233023af31..a05129f539 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml index 43afd73364..91c6e2e220 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: ckpt-resume \ No newline at end of file +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml index 47ff5b038b..cf95759fc5 100644 --- a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml @@ -50,4 +50,4 @@ MODEL_ARGS: --data-cache-path: ${DATA_CACHE_PATH} --deterministic-mode: true --ckpt-format: torch -TEST_TYPE: regular \ No newline at end of file +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index 64784c36a6..5cc9a2e0d6 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -3,44 +3,38 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' NVTE_FLASH_ATTN: '0' NVTE_FUSED_ATTN: '0' - TEST_TYPE: 'release' - MODEL_ARGS: # T5 model args --encoder-num-layers: 12 --decoder-num-layers: 12 --hidden-size: 768 - --num-attention-heads: 12 + --num-attention-heads: 12 --kv-channels: 64 --ffn-hidden-size: 3072 --encoder-seq-length: 512 --decoder-seq-length: 128 - --max-position-embeddings: 512 + --max-position-embeddings: 512 --init-method-std: 0.015 - # Training args - --micro-batch-size: 32 - --global-batch-size: 512 - --train-iters: 100000 - --weight-decay: 1e-2 - --clip-grad: 1.0 + --micro-batch-size: 32 + --global-batch-size: 512 + --train-iters: 100000 + --weight-decay: 1e-2 + --clip-grad: 1.0 --bf16: true --lr: 0.0001 - --lr-decay-style: linear - --min-lr: 1.0e-5 - --lr-warmup-fraction: .01 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: 
.01 --distributed-backend: nccl - # Transformer Engine args --use-mcore-models: true --transformer-impl: transformer_engine - # Model parallel --tensor-model-parallel-size: 4 - --pipeline-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 --encoder-pipeline-model-parallel-size: 0 - # Data args --data-path: ${DATA_BLEND} --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt @@ -48,11 +42,10 @@ MODEL_ARGS: --split: 99982,9,9 --data-cache-path: ${DATA_CACHE_PATH} --vocab-extra-ids: 100 - # EVAL_AND_LOGGING_ARGS --log-interval: 100 --save-interval: 2000 - --eval-interval: 1000 + --eval-interval: 1000 --save: ${CHECKPOINT_PATH} --load: ${CHECKPOINT_PATH} --eval-iters: 10 @@ -64,4 +57,4 @@ MODEL_ARGS: --log-validation-ppl-to-tensorboard: true --timing-log-level: 2 --wandb-project: megatron-core-release-runs - --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file + --wandb-exp-name: ${WANDB_EXPERIMENT} diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py index fa00a20cad..be7621d7f4 100644 --- a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py +++ b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py @@ -33,6 +33,7 @@ def teardown_method(self, method): ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))], ) + @pytest.mark.flaky def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): Utils.initialize_model_parallel(*src_tp_pp) with TempNamedDir( From 6e05f339f2ebbcd7369f62dd963809ec880b5420 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 09:15:53 -0700 Subject: [PATCH 4/8] ADLR/megatron-lm!2268 - ci: Fix defaults --- .gitlab-ci.yml | 8 ++++---- tests/unit_tests/dist_checkpointing/test_async_save.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 06334601b4..a93e1cb615 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,7 +18,7 @@ workflow: FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_TIME_LIMIT: 1800, + FUNCTIONAL_TEST_TIME_LIMIT: 1800 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -29,7 +29,7 @@ workflow: FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 - FUNCTIONAL_TEST_TIME_LIMIT: 1800, + FUNCTIONAL_TEST_TIME_LIMIT: 1800 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" @@ -39,8 +39,8 @@ workflow: UNIT_TEST_TIMEOUT: 75 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly - FUNCTIONAL_TEST_REPEAT: 1, - FUNCTIONAL_TEST_TIME_LIMIT: 9000, + FUNCTIONAL_TEST_REPEAT: 1 + FUNCTIONAL_TEST_TIME_LIMIT: 9000 FUNCTIONAL_TEST_CLUSTER_A100: "" FUNCTIONAL_TEST_CLUSTER_H100: "" PUBLISH: "no" diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py index d6aa879982..d50aea30e2 100644 --- a/tests/unit_tests/dist_checkpointing/test_async_save.py +++ b/tests/unit_tests/dist_checkpointing/test_async_save.py @@ -71,6 +71,7 @@ def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt): @pytest.mark.parametrize('async_save', [False, True]) @pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn]) + @pytest.mark.flaky def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn): Utils.initialize_model_parallel(2, 4) sharded_state_dict = { From d00cc116f53ded94c13485e2cd939a4105f28716 
Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sat, 26 Oct 2024 19:29:18 -0700 Subject: [PATCH 5/8] ADLR/megatron-lm!2195 - Remove guard blocking distributed optimizer when TE/Apex are not installed --- megatron/core/optimizer/distrib_optimizer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py index dfa8d51979..e814794f0b 100644 --- a/megatron/core/optimizer/distrib_optimizer.py +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -18,7 +18,7 @@ try: from apex.optimizers import FusedAdam as Adam except ImportError: - from torch.optim import Adam + from torch.optim import AdamW as Adam HAVE_APEX_OR_TE = False @@ -462,10 +462,6 @@ def __init__( if has_config_logger_enabled(config): log_config_to_disk(config, locals(), prefix=type(self).__name__) - assert ( - HAVE_APEX_OR_TE - ), f'Please install Apex or Transformer Engine to use DistributedOptimizer.' - super().__init__(optimizer, config, grad_scaler, init_state_fn) self.model_chunks = model_chunks self.ddp_config = self.model_chunks[0].ddp_config From 5b2f5b08e917ef9741d12cb46fff7de46095c4bf Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 26 Oct 2024 19:29:20 -0700 Subject: [PATCH 6/8] ADLR/megatron-lm!2255 - ci: Improvements around functional triggering --- .gitlab-ci.yml | 3 + .gitlab/stages/02.functional-tests.yml | 2 + .../python_test_utils/jet/common.py | 48 ++++++-- .../jet/generate_jet_trigger_job.py | 108 +++++++++++------- 4 files changed, 110 insertions(+), 51 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a93e1cb615..83d432ea71 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,6 +94,9 @@ variables: FUNCTIONAL_TEST_TIME_LIMIT: value: "1800" description: "Timeout in seconds per test" + FUNCTIONAL_TEST_CASES: + value: "all" + description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite." FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index db49c99c60..99d6b4888a 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -59,6 +59,7 @@ functional:configure: --environment dev \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ + --test-cases $FUNCTIONAL_TEST_CASES \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ @@ -72,6 +73,7 @@ functional:configure: --environment lts \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \ + --test-cases $FUNCTIONAL_TEST_CASES \ --a100-cluster $A100_CLUSTER \ --h100-cluster $H100_CLUSTER \ --container-image ${CI_MCORE_LTS_IMAGE} \ diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py index 9313e0a59c..301189e8e2 100644 --- a/tests/functional_tests/python_test_utils/jet/common.py +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -65,7 +65,7 @@ def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: def filter_by_test_case( workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str -) -> jetclient.JETWorkloadManifest: +) -> Optional[jetclient.JETWorkloadManifest]: """Returns a workload with matching name. 
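The distrib_optimizer.py hunk above removes the hard Apex/TE requirement and switches the pure-PyTorch fallback from Adam to AdamW, presumably because AdamW's decoupled weight decay is the closer match to Apex FusedAdam's default behaviour. A minimal sketch of the resulting import pattern; the two usage lines at the end are illustrative only.

    import torch

    try:
        from apex.optimizers import FusedAdam as Adam    # fused CUDA kernels when Apex is present
    except ImportError:
        from torch.optim import AdamW as Adam            # decoupled weight decay, closest match

    # Illustrative usage:
    params = [torch.nn.Parameter(torch.zeros(2, 2))]
    optimizer = Adam(params, lr=1e-4, weight_decay=0.1)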
Raises an error if there no or more than a single workload.""" workload_manifests = list( workload_manifest @@ -74,10 +74,12 @@ def filter_by_test_case( ) if len(workload_manifests) > 1: - raise ValueError("Duplicate test_case found!") + print("Duplicate test_case found!") + return if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return return workload_manifests[0] @@ -93,7 +95,8 @@ def filter_by_scope( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] return workload_manifests @@ -111,7 +114,8 @@ def filter_by_environment( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] return workload_manifests @@ -127,7 +131,26 @@ def filter_by_model( ) if len(workload_manifests) == 0: - raise ValueError("No test_case found!") + print("No test_case found!") + return [] + + return workload_manifests + + +def filter_by_test_cases( + workload_manifests: List[jetclient.JETWorkloadManifest], test_cases: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns a workload with matching name. Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + for test_case in test_cases.split(",") + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) == 0: + print("No test_case found!") + return [] return workload_manifests @@ -137,6 +160,7 @@ def load_workloads( n_repeat: int = 1, time_limit: int = 1800, environment: Optional[str] = None, + test_cases: str = "all", scope: Optional[str] = None, model: Optional[str] = None, test_case: Optional[str] = None, @@ -156,15 +180,21 @@ def load_workloads( if scope: workloads = filter_by_scope(workload_manifests=workloads, scope=scope) - if environment: + if workloads and environment: workloads = filter_by_environment(workload_manifests=workloads, environment=environment) - if model: + if workloads and model: workloads = filter_by_model(workload_manifests=workloads, model=model) - if test_case: + if workloads and test_cases != "all": + workloads = filter_by_test_cases(workload_manifests=workloads, test_cases=test_cases) + + if workloads and test_case: workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + if not workloads: + return [] + for workload in list(workloads): for build_workload in build_workloads: if ( diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 670072fc86..b21de4a22f 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -14,6 +14,9 @@ @click.option("--environment", required=True, type=str, help="LTS or dev features") @click.option("--n-repeat", required=False, default=1, type=int) @click.option("--time-limit", required=False, default=1, type=int) +@click.option( + "--test-cases", required=True, type=str, help="Comma-separated list of test_cases, or 'all'" +) @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") @@ -33,6 +36,7 @@ def main( environment: str, 
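A self-contained sketch of the new comma-separated test-case filter added to common.py; Workload and Spec below are stand-ins for jetclient.JETWorkloadManifest so the behaviour can be run in isolation.

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Spec:
        test_case: str

    @dataclass
    class Workload:          # stand-in for jetclient.JETWorkloadManifest
        spec: Spec

    def filter_by_test_cases(workloads: List[Workload], test_cases: str) -> List[Workload]:
        wanted = test_cases.split(",")
        selected = [w for w in workloads if w.spec.test_case in wanted]
        if not selected:
            print("No test_case found!")   # same soft failure as the patched helpers
        return selected

    # "all" is handled one level up: load_workloads only applies this filter
    # when FUNCTIONAL_TEST_CASES != "all".
    pool = [Workload(Spec("gpt3_mr_tp2_pp2")), Workload(Spec("t5_220m_mr_tp4_pp1"))]
    print([w.spec.test_case for w in filter_by_test_cases(pool, "gpt3_mr_tp2_pp2,unknown")])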
n_repeat: int, time_limit: int, + test_cases: str, a100_cluster: str, h100_cluster: str, output_path: str, @@ -44,56 +48,76 @@ def main( test_cases = [ test_case for test_case in common.load_workloads( - scope=scope, container_tag=container_tag, environment=environment + scope=scope, container_tag=container_tag, environment=environment, test_cases=test_cases ) if test_case.type != "build" ] - gitlab_pipeline = { - "stages": list(set([test_case.spec.model for test_case in test_cases])), - "default": {"interruptible": True}, - } + if not test_cases: + gitlab_pipeline = { + "stages": ["empty-pipeline-placeholder"], + "default": {"interruptible": True}, + "empty-pipeline-placeholder-job": { + "stage": "empty-pipeline-placeholder", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "script": ["sleep 1"], + "artifacts": {"paths": ["results/"], "when": "always"}, + }, + } + + else: + gitlab_pipeline = { + "stages": list(set([test_case.spec.model for test_case in test_cases])), + "default": {"interruptible": True}, + } - for test_case in test_cases: - if test_case.spec.platforms == "dgx_a100": - cluster = a100_cluster - elif test_case.spec.platforms == "dgx_h100": - cluster = h100_cluster - else: - raise ValueError(f"Platform {test_case.spec.platforms} unknown") + for test_case in test_cases: + if test_case.spec.platforms == "dgx_a100": + cluster = a100_cluster + elif test_case.spec.platforms == "dgx_h100": + cluster = h100_cluster + else: + raise ValueError(f"Platform {test_case.spec.platforms} unknown") - script = [ - "export PYTHONPATH=$(pwd); " - "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", - f"--model {test_case.spec.model}", - f"--environment {test_case.spec.environment}", - f"--n-repeat {n_repeat}", - f"--time-limit {time_limit}", - f"--test-case {test_case.spec.test_case}", - f"--container-tag {container_tag}", - f"--cluster {cluster}", - ] + script = [ + "export PYTHONPATH=$(pwd); " + "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + f"--model {test_case.spec.model}", + f"--environment {test_case.spec.environment}", + f"--n-repeat {n_repeat}", + f"--time-limit {time_limit}", + f"--test-case {test_case.spec.test_case}", + f"--container-tag {container_tag}", + f"--cluster {cluster}", + ] - if run_name is not None and wandb_experiment is not None: - script.append(f"--run-name {run_name}") - test_case.spec.model - script.append( - f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" - ) + if run_name is not None and wandb_experiment is not None: + script.append(f"--run-name {run_name}") + test_case.spec.model + script.append( + f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" + ) - gitlab_pipeline[test_case.spec.test_case] = { - "stage": f"{test_case.spec.model}", - "image": f"{container_image}:{container_tag}", - "tags": ["mcore-docker-node-jet"], - "rules": [ - {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, - {"if": '$CI_MERGE_REQUEST_ID'}, - ], - "timeout": "7 days", - "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], - "script": [" ".join(script)], - "artifacts": {"paths": ["results/"], "when": "always"}, - } + gitlab_pipeline[test_case.spec.test_case] = { + "stage": 
f"{test_case.spec.model}", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "script": [" ".join(script)], + "artifacts": {"paths": ["results/"], "when": "always"}, + } with open(output_path, 'w') as outfile: yaml.dump(gitlab_pipeline, outfile, default_flow_style=False) From 210162aebcfc68d72f39049d5cf84a83d3b11dea Mon Sep 17 00:00:00 2001 From: Xin Yao Date: Mon, 28 Oct 2024 03:56:56 -0700 Subject: [PATCH 7/8] ADLR/megatron-lm!2201 - Make RoPE work with packed sequence and CP and Miscellaneous fixes --- .../core/extensions/transformer_engine.py | 49 ++++++++------ .../models/common/embeddings/rope_utils.py | 65 +++++++++++++------ .../common/embeddings/rotary_pos_embedding.py | 21 ++++-- .../embeddings/yarn_rotary_pos_embedding.py | 10 +++ megatron/core/models/gpt/gpt_model.py | 7 +- megatron/core/transformer/attention.py | 10 ++- .../core/transformer/transformer_config.py | 17 ++++- megatron/training/arguments.py | 2 + .../model_config.yaml | 1 + .../model_config.yaml | 1 + 10 files changed, 131 insertions(+), 52 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 47606af27d..a33082d6f0 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -655,11 +655,6 @@ def forward( packed_seq_kwargs = ( dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} ) - # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set - # after init - if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): - self.qkv_format = 'bshd' - qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) if get_te_version() < PkgVersion("1.3.0"): @@ -676,17 +671,6 @@ def forward( packed_seq_kwargs.pop("cu_seqlens_q_padded", None) packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) - if self.config.apply_rope_fusion and qkv_format == 'bshd': - query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] - # In PyTorch, the following two tensors are in fact the same: - # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) - # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) - # Stride for a dimension that is 1 has no meaning, so tensors created two different ways - # can have same shape but different strides. 
- # We unify them to the first one to pass the stride check in TE - if value.shape == key.shape and value.shape[0] == 1 and value.stride() != key.stride(): - value = value.as_strided(value.shape, key.stride()) - if self.te_forward_mask_type: if qkv_format == 'thd' and is_te_min_version("1.7.0"): # thd format uses flash attention with cuDNN kernel which requires is_padding=True, @@ -707,10 +691,7 @@ def forward( else: core_attn_out = super().forward(query, key, value, attention_mask, **packed_seq_kwargs) - if self.config.apply_rope_fusion and qkv_format == 'bshd': - return core_attn_out.transpose(0, 1) - else: - return core_attn_out + return core_attn_out if is_te_min_version("1.9.0.dev0"): @@ -1091,3 +1072,31 @@ def get_cpu_offload_context( except ImportError: get_cpu_offload_context = None + + +try: + + from transformer_engine.pytorch.attention import FusedRoPEFunc + + def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + """Apply rotary positional embedding to input tensor T in `sbhd` format.""" + return FusedRoPEFunc.apply(t, freqs, "sbhd") + + def fused_apply_rotary_pos_emb_thd( + t: torch.Tensor, + cu_seqlens: torch.Tensor, + freqs: torch.Tensor, + cp_size: int = 1, + cp_rank: int = 0, + ) -> torch.Tensor: + """ + Apply rotary positional embedding to input tensor T in `thd` format with CP support. + """ + if is_te_min_version("1.11.0", check_equality=False): + return FusedRoPEFunc.apply(t, freqs, "thd", cu_seqlens, cp_size, cp_rank) + else: + return FusedRoPEFunc.apply(t, freqs, "thd", cu_seqlens) + +except ImportError: + + pass diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py index accb251961..fc7d355827 100644 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -13,18 +13,27 @@ from torch import Tensor from megatron.core import parallel_state +from megatron.core.utils import is_te_min_version logger = logging.getLogger(__name__) try: - from apex.transformer.functional import ( + from megatron.core.extensions.transformer_engine import ( fused_apply_rotary_pos_emb, fused_apply_rotary_pos_emb_thd, ) HAVE_APPLY_ROPE_FUSION = True except ImportError: - HAVE_APPLY_ROPE_FUSION = False + try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True + except ImportError: + HAVE_APPLY_ROPE_FUSION = False def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: @@ -103,6 +112,20 @@ def _apply_rotary_pos_emb_bshd( return torch.cat((t, t_pass), dim=-1) +def _get_thd_freqs_on_this_cp_rank(cp_rank: int, cp_size: int, x: Tensor, freqs: Tensor) -> Tensor: + if cp_size > 1: + cp_seg = x.size(0) // 2 + full_seqlen = cp_size * x.size(0) + return torch.cat( + [ + freqs[cp_rank * cp_seg : (cp_rank + 1) * cp_seg], + freqs[full_seqlen - (cp_rank + 1) * cp_seg : full_seqlen - cp_rank * cp_seg], + ] + ) + else: + return freqs[: x.size(0)] + + def _apply_rotary_pos_emb_thd( t: Tensor, cu_seqlens: Tensor, @@ -123,12 +146,16 @@ def _apply_rotary_pos_emb_thd( Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
""" + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cu_seqlens = cu_seqlens // cp_size seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( [ _apply_rotary_pos_emb_bshd( x.unsqueeze(1), - freqs[: x.size(0)], + _get_thd_freqs_on_this_cp_rank(cp_rank, cp_size, x, freqs), rotary_interleaved=rotary_interleaved, multi_latent_attention=multi_latent_attention, mscale=mscale, @@ -149,28 +176,24 @@ def apply_rotary_pos_emb( Reroute to the appropriate apply_rotary_pos_emb function depending on fused/unfused kernels, or bshd (conventional) / thd (packed seq) format """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False - # so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): - logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" - ) - apply_rotary_pos_emb.printed_fused_warning = True - - if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: - logger.warning( - "rotary_interleaved is not supported with multi_latent_attention, setting it to False" - ) - config.rotary_interleaved = False if config.apply_rope_fusion: if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + return fused_apply_rotary_pos_emb(t, freqs) else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + cp_size = parallel_state.get_context_parallel_world_size() + if cp_size > 1: + if not is_te_min_version("1.11.0", check_equality=False): + raise ValueError("Only TE >= 1.12 supports RoPE fusion for THD format with CP.") + return fused_apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + cp_size=cp_size, + cp_rank=parallel_state.get_context_parallel_rank(), + ) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) else: if cu_seqlens is None: return _apply_rotary_pos_emb_bshd( diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 5232faec60..92c3efb379 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -7,9 +7,12 @@ if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_block import TransformerBlock + from megatron.core.inference_params import InferenceParams + from megatron.core.packed_seq_params import PackedSeqParams import logging import math +from functools import lru_cache import torch from torch import Tensor, nn @@ -109,12 +112,14 @@ def _apply_scaling( return inv_freq_llama - def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + @lru_cache(maxsize=32) + def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> Tensor: """Forward pass of RoPE embedding. Args: max_seq_len (int): Maximum size of sequence - offset (int, optional): _description_. Defaults to 0. + offset (int, optional): RoPE offset. Defaults to 0. + packed_seq (bool, optional): Whether to use packed sequence. Defaults to False. Returns: Tensor: Embeddings after applying RoPE. 
@@ -141,7 +146,7 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: ) # emb [seq_length, .., dim] emb = emb[:, None, None, :] - if parallel_state.get_context_parallel_world_size() > 1: + if parallel_state.get_context_parallel_world_size() > 1 and not packed_seq: # slice rotary_pos_emb along sequence dimension and select the parition of the current # CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) @@ -153,10 +158,11 @@ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): def get_rotary_seq_len( self, - inference_params, + inference_params: InferenceParams, transformer: TransformerBlock, transformer_input: Tensor, transformer_config: TransformerConfig, + packed_seq_params: PackedSeqParams, ) -> float: """Function to get the rotary sequence length. @@ -166,11 +172,16 @@ def get_rotary_seq_len( by the model transformer_input (Tensor): Input tensor to the transformer transformer_config (TransformerConfig): Transformer config used by the model + packed_seq_params (PackedSeqParams): Packed sequence params Returns: float: The rotary sequence length """ - if inference_params is not None: + if packed_seq_params is not None: + # max_seqlen is the maximum sequence length in the packed sequence before being divided + # by the tp and cp size. + return max(packed_seq_params.max_seqlen_q, packed_seq_params.max_seqlen_kv) + elif inference_params is not None: rotary_seq_len = inference_params.max_sequence_length else: if transformer.input_tensor is not None: diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py index 14d147ea34..3ab155dcdb 100644 --- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -4,6 +4,7 @@ import logging import math +from functools import lru_cache import torch from torch import Tensor @@ -82,8 +83,17 @@ def __init__( use_cpu_initialization, ) + @lru_cache(maxsize=32) def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + """Forward pass of Yarn Rotary Embedding. + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): RoPE offset. Defaults to 0. + + Returns: + Tensor: Embeddings after applying Yarn RoPE. + """ assert ( not self.rotary_interleaved ), "Yarn RoPE does not support interleaved rotary embeddings" diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index bd52f89680..f7567621f6 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -218,9 +218,12 @@ def forward( rotary_pos_emb = None if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( - inference_params, self.decoder, decoder_input, self.config + inference_params, self.decoder, decoder_input, self.config, packed_seq_params + ) + rotary_pos_emb = self.rotary_pos_emb( + rotary_seq_len, + packed_seq=packed_seq_params is not None and packed_seq_params.qkv_format == 'thd', + ) - rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) # Run decoder.
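Both rotary embedding forwards above are now wrapped in `@lru_cache(maxsize=32)`, and the GPT model passes a `packed_seq` flag that becomes part of the cache key, so identical `(max_seq_len, offset, packed_seq)` calls reuse a previously built table instead of recomputing it every microbatch. A rough standalone sketch of that caching behaviour, with simplified frequency math and an illustrative class name:

from functools import lru_cache

import torch

class TinyRotaryEmbedding:
    # Simplified stand-in for the cached RoPE table computation.
    def __init__(self, dim: int, base: float = 10000.0):
        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

    @lru_cache(maxsize=32)
    def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False) -> torch.Tensor:
        # Hashable scalar arguments make the call cacheable; repeating the same
        # (max_seq_len, offset, packed_seq) triple returns the stored tensor.
        seq = torch.arange(max_seq_len, dtype=torch.float32) + offset
        freqs = torch.outer(seq, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb[:, None, None, :]  # [seq, 1, 1, dim]

rope = TinyRotaryEmbedding(dim=8)
assert rope.forward(1024) is rope.forward(1024)                        # second call is a cache hit
assert rope.forward(1024, packed_seq=True) is not rope.forward(1024)   # different key, new table

The cache key includes the instance itself, so a long-lived embedding module reuses its tables across iterations, and packed and non-packed tables never collide because `packed_seq` is part of the key.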
hidden_states = self.decoder( diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 31fd8553e0..32fab28b49 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -283,8 +283,14 @@ def forward( q_pos_emb, k_pos_emb = rotary_pos_emb if packed_seq_params is not None: - cu_seqlens_q = packed_seq_params.cu_seqlens_q - cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + if packed_seq_params.cu_seqlens_q_padded is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded + else: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + if packed_seq_params.cu_seqlens_kv_padded is not None: + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv_padded + else: + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv else: cu_seqlens_q = cu_seqlens_kv = None query = apply_rotary_pos_emb( diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index c67913e164..8b374ca4be 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -493,11 +493,24 @@ def __post_init__(self): "When bias_activation_fusion is True, gated_linear_unit is False, " "and activation function is gelu, add_bias_linear must also be True." ) + if self.activation_func_fp8_input_store: if self.activation_func != F.silu or not self.gated_linear_unit: raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.") - if self.apply_rope_fusion and self.rotary_interleaved: - raise ValueError('rotary_interleaved does not work with apply_rope_fusion.') + + if self.apply_rope_fusion: + if self.rotary_interleaved: + raise ValueError("rotary_interleaved does not work with apply_rope_fusion.") + + from megatron.core.models.common.embeddings.rope_utils import HAVE_APPLY_ROPE_FUSION + + if not HAVE_APPLY_ROPE_FUSION: + raise ValueError( + "apply_rope_fusion is not available. Please install TE >= 1.4 or Apex." 
+ ) + + if self.multi_latent_attention and self.rotary_interleaved: + raise ValueError("rotary_interleaved does not work with multi_latent_attention.") if self.init_method is None: self.init_method = init_method_normal(self.init_method_std) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index e3d876a5f2..64c92ea3cd 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -547,6 +547,8 @@ def validate_args(args, defaults={}): raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') if args.rotary_interleaved and args.use_legacy_models: raise RuntimeError('--rotary-interleaved is not supported in legacy models.') + if args.position_embedding_type != 'rope': + args.apply_rope_fusion = False # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now # don't allow it to keep things simple diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml index 7bdd0c46e2..1649d326ec 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -41,6 +41,7 @@ MODEL_ARGS: --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 2 --position-embedding-type: rope + --no-rope-fusion: true --no-ckpt-fully-parallel-save: true --deterministic-mode: true --no-gradient-accumulation-fusion: true diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml index b2a1643ec8..6ca7dcf27f 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -41,6 +41,7 @@ MODEL_ARGS: --tensor-model-parallel-size: 1 --pipeline-model-parallel-size: 2 --position-embedding-type: rope + --no-rope-fusion: true --no-ckpt-fully-parallel-save: true --deterministic-mode: true --no-gradient-accumulation-fusion: true From aa6be133ac7530916501a7be4cc34c6dcc169694 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 28 Oct 2024 03:56:59 -0700 Subject: [PATCH 8/8] ADLR/megatron-lm!2270 - ci: Faster unit tests --- .gitlab-ci.yml | 14 ++--- .gitlab/stages/00.pre.yml | 7 +-- .gitlab/stages/01.test.yml | 60 ++++++++++++------- .../shell_test_utils/run_ci_test.sh | 4 +- 4 files changed, 50 insertions(+), 35 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 83d432ea71..649ffb447b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,8 +13,8 @@ workflow: FUNCTIONAL_TEST: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 @@ -24,8 +24,8 @@ workflow: PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + 
UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 @@ -35,8 +35,8 @@ workflow: PUBLISH: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - UNIT_TEST_REPEAT: 5 - UNIT_TEST_TIMEOUT: 75 + UNIT_TEST_REPEAT: 1 + UNIT_TEST_TIMEOUT: 10 FUNCTIONAL_TEST: "yes" FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_REPEAT: 1 @@ -71,7 +71,7 @@ variables: value: "1" description: "Number of repetitions" UNIT_TEST_TIMEOUT: - value: "15" + value: "10" description: Timeout (minutes) for Unit tests (all repeats) FUNCTIONAL_TEST: value: "yes" diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 82cc9514f1..1b9e453554 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -41,10 +41,9 @@ pre:create_ci_branches: matrix: - branch: ci-unit-test-extended - branch: ci-rebuild-mcore-nemo-image - - branch: ci-mr-a100 - - branch: ci-nightly-a100 - - branch: ci-weekly-a100 - - branch: ci-weekly-h100 + - branch: ci-mr + - branch: ci-nightly + - branch: ci-weekly - branch: ci-pre-release tags: [mcore-docker-node-small] stage: .pre diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml index ca55de7d84..c12b5175ab 100644 --- a/.gitlab/stages/01.test.yml +++ b/.gitlab/stages/01.test.yml @@ -90,28 +90,50 @@ test:build_image: tags: [8xL40S] variables: GIT_STRATEGY: none + parallel: + matrix: + - BUCKET: tests/unit_tests/data/ + - BUCKET: tests/unit_tests/dist_checkpointing/ + - BUCKET: tests/unit_tests/distributed/ + - BUCKET: tests/unit_tests/models/ + - BUCKET: tests/unit_tests/pipeline_parallel/ tests/unit_tests/tensor_parallel/ + - BUCKET: tests/unit_tests/transformer/ + - BUCKET: other script: - - if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi; - - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" + - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" - | - docker exec mcore_ci_${CI_PIPELINE_ID} bash -c ' - set -e + CMD=$(cat <<"RUN_TEST_EOF" + set -euxo pipefail - MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") + MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") - cd /opt/megatron-lm$MCORE_DIR; + cd /opt/megatron-lm$MCORE_DIR; - for i in $(seq $UNIT_TEST_REPEAT); do - SEED=$((RANDOM % 9000 + 1000)); - ARGS=() - if [[ $TAG != latest ]]; then - ARGS+=(-m "not internal and not flaky and not flaky_in_dev") - else - ARGS+=(-m "not flaky and not flaky_in_dev") - fi - timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests - done - ' + for i in $(seq $UNIT_TEST_REPEAT); do + SEED=$((RANDOM % 9000 + 1000)); + ARGS=() + if [[ $TAG != latest ]]; then + ARGS+=(-m "not internal and not flaky and not flaky_in_dev") + else + ARGS+=(-m "not flaky and not flaky_in_dev") + fi + + if [[ $BUCKET == other ]]; then + BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " 
"\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " ")) + ARGS+=(${BUCKETS[@]}) + BUCKET=(tests/unit_tests) + else + BUCKET=(${BUCKET}) + fi + + if [[ -d $BUCKET ]]; then + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" ${BUCKET[@]} + fi + done + RUN_TEST_EOF + ) + + docker exec mcore_ci_${CI_PIPELINE_ID} bash -c "$CMD" after_script: - docker container stop mcore_ci_${CI_PIPELINE_ID} || true artifacts: @@ -135,8 +157,6 @@ test:pyt(LTS)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_LTS_IMAGE} - UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 15 test:pyt(DEV)_mcore(latest): extends: [.unit_tests] @@ -149,8 +169,6 @@ test:pyt(DEV)_mcore(0.9.0): variables: TAG: core_r0.9.0 IMAGE: ${CI_MCORE_DEV_IMAGE} - UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 15 test:notify_unit_tests: extends: [.test_rules] diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 9dc22e3929..fac0704b4c 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -42,10 +42,8 @@ NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \ | yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO') SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \ | yq '.ENV_VARS.SKIP_PYTEST') -N_REPEATS=$(cat $TRAINING_PARAMS_PATH \ - | yq '.ENV_VARS.N_REPEATS //1') -for i in $(seq 1 $N_REPEATS); +for i in $(seq 1 $N_REPEAT); do if [[ $i -gt 1 ]]; then rm -rf $CHECKPOINT_PATH/*