Skip to content

Commit

Permalink
Merge branch 'NVIDIA:main' into downproj-lr-scaling
Browse files Browse the repository at this point in the history
  • Loading branch information
dhia680 authored Oct 28, 2024
2 parents 54c9bdd + 38736c2 commit 8a2e40b
Show file tree
Hide file tree
Showing 172 changed files with 642 additions and 515 deletions.
44 changes: 34 additions & 10 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,51 +13,66 @@ workflow:
FUNCTIONAL_TEST: "no"
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 75
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 1800
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 75
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 1800
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 75
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 9000
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
FUNCTIONAL_TEST: "no"
PUBLISH: "no"
- when: never
auto_cancel:
on_new_commit: interruptible

# Stages declared for this pipeline, listed in declaration order.
stages:
  - test
  - functional_tests
  - convergence_tests
  - publish

# Job-level defaults: every job is interruptible unless it overrides this.
default:
  interruptible: true

variables:
UNIT_TEST_TIMEOUT:
value: "15"
description: Timeout (minutes) for Unit tests (all repeats)
# Toggle for the unit-test jobs; the rules in .gitlab/stages/01.test.yml
# only match when $UNIT_TEST == 'yes'.
UNIT_TEST:
  value: "yes"
  options:
    - "yes"
    - "no"
  description: To run the unit test suite
UNIT_TEST_REPEAT:
value: "1"
description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
value: "10"
description: Timeout (minutes) for Unit tests (all repeats)
FUNCTIONAL_TEST:
value: "yes"
options:
Expand All @@ -73,6 +88,15 @@ variables:
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
value: "1800"
description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_CLUSTER_A100:
value: "dgxa100_dracooci"
options:
Expand Down
7 changes: 3 additions & 4 deletions .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,9 @@ pre:create_ci_branches:
matrix:
- branch: ci-unit-test-extended
- branch: ci-rebuild-mcore-nemo-image
- branch: ci-mr-a100
- branch: ci-nightly-a100
- branch: ci-weekly-a100
- branch: ci-weekly-h100
- branch: ci-mr
- branch: ci-nightly
- branch: ci-weekly
- branch: ci-pre-release
tags: [mcore-docker-node-small]
stage: .pre
Expand Down
213 changes: 188 additions & 25 deletions .gitlab/stages/01.test.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.test_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
when: on_success
- when: on_success
Expand Down Expand Up @@ -46,7 +46,7 @@ test:build_image:
ADDITIONAL_PARAMS=()
if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" ]]; then
ADDITIONAL_PARAMS+=("--pull")
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main")
fi
Expand Down Expand Up @@ -90,38 +90,60 @@ test:build_image:
tags: [8xL40S]
variables:
GIT_STRATEGY: none
parallel:
matrix:
- BUCKET: tests/unit_tests/data/
- BUCKET: tests/unit_tests/dist_checkpointing/
- BUCKET: tests/unit_tests/distributed/
- BUCKET: tests/unit_tests/models/
- BUCKET: tests/unit_tests/pipeline_parallel/ tests/unit_tests/tensor_parallel/
- BUCKET: tests/unit_tests/transformer/
- BUCKET: other
script:
- if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi;
- docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
- docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
- |
docker exec mcore_ci_${CI_PIPELINE_ID} bash -c '
set -e
CMD=$(cat <<"RUN_TEST_EOF"
set -euxo pipefail
MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")
cd /opt/megatron-lm$MCORE_DIR;
for i in $(seq $UNIT_TEST_REPEAT); do
SEED=$((RANDOM % 9000 + 1000));
ARGS=()
if [[ $TAG != latest ]]; then
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
else
ARGS+=(-m "not flaky and not flaky_in_dev")
fi
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
done
'
MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")
cd /opt/megatron-lm$MCORE_DIR;
for i in $(seq $UNIT_TEST_REPEAT); do
SEED=$((RANDOM % 9000 + 1000));
ARGS=()
if [[ $TAG != latest ]]; then
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
else
ARGS+=(-m "not flaky and not flaky_in_dev")
fi
if [[ $BUCKET == other ]]; then
BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " "\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " "))
ARGS+=(${BUCKETS[@]})
BUCKET=(tests/unit_tests)
else
BUCKET=(${BUCKET})
fi
if [[ -d $BUCKET ]]; then
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" ${BUCKET[@]}
fi
done
RUN_TEST_EOF
)
docker exec mcore_ci_${CI_PIPELINE_ID} bash -c "$CMD"
after_script:
- docker container stop mcore_ci_${CI_PIPELINE_ID} || true
artifacts:
paths:
- coverage
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
allow_failure: true
when: on_success
- if: $UNIT_TEST_REPEAT != '0'
- if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
when: on_success

test:pyt(LTS)_mcore(latest):
Expand All @@ -148,7 +170,7 @@ test:pyt(DEV)_mcore(0.9.0):
TAG: core_r0.9.0
IMAGE: ${CI_MCORE_DEV_IMAGE}

test:notify:
test:notify_unit_tests:
extends: [.test_rules]
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
needs:
Expand Down Expand Up @@ -229,4 +251,145 @@ test:secret_detection:
echo "Atleast one vulnerability has been found"
cat gl-secret-detection-report.json | jq '.'
exit 1
fi
fi
# Builds the package with CPython 3.10 and 3.11 inside the manylinux image,
# then runs auditwheel over the resulting wheels. The job publishes the
# (possibly patched) package_info.py and the wheelhouse/ directory as artifacts.
test:pypi_build_wheel:
  extends:
    - .test_rules
  image:
    name: quay.io/pypa/manylinux_2_28_x86_64
    entrypoint:
      - ""
  tags:
    - mcore-docker-node-small
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    # Dry runs replace the PATCH line of package_info.py with a random number
    # in 1000..9999 (presumably so repeated test uploads get distinct
    # versions -- TODO confirm).
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        sed -i "/^PATCH/c\PATCH = $((RANDOM % 9000 + 1000))" megatron/core/package_info.py
      fi
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    - auditwheel repair dist/*.whl
  artifacts:
    paths:
      - megatron/core/package_info.py
      - wheelhouse/

# Installs the cp310 wheel produced by test:pypi_build_wheel and checks that
# the version it reports matches the version read from the source checkout.
test:pypi_test_wheel:
  extends:
    - .test_rules
  image: nvcr.io/nvidia/pytorch:24.01-py3
  needs:
    - test:pypi_build_wheel
  tags:
    - mcore-docker-node-small
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    # Version as reported by the megatron/ directory in the working tree.
    - EXPECTED_RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Remove the source tree before installing, so the second import below
    # can no longer be satisfied by the local megatron/ directory.
    - rm -rf megatron
    - pip install wheelhouse/*cp310*.whl
    # Version as reported after installing the wheel.
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Log both values, then fail the job if they differ.
    - echo "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
    - test "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
  artifacts:
    paths:
      - wheelhouse/

# Uploads the wheels from wheelhouse/ with twine. With PUBLISH_DRYRUN=yes the
# upload goes to TestPyPI using the TWINE_TEST_* credentials; otherwise it
# goes to PyPI using the TWINE_PROD_* credentials.
test:pypi_push_wheel:
  extends: [.test_rules]
  image: python:3.10
  tags: [mcore-docker-node-small]
  needs: [test:pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    # Credentials are exported under the names twine reads on its own
    # (TWINE_USERNAME / TWINE_PASSWORD), so the password never has to be
    # passed as a command-line argument.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        REPOSITORY=testpypi
        export TWINE_USERNAME=$TWINE_TEST_USERNAME
        export TWINE_PASSWORD=$TWINE_TEST_PASSWORD
      else
        REPOSITORY=pypi
        export TWINE_USERNAME=$TWINE_PROD_USERNAME
        export TWINE_PASSWORD=$TWINE_PROD_PASSWORD
      fi
    - pip install twine
    # --non-interactive makes a missing credential fail the job instead of
    # waiting on a prompt that CI can never answer.
    - twine upload --non-interactive --repository "$REPOSITORY" wheelhouse/*

# Creates a GitHub release for NVIDIA/Megatron-LM. The release body is the
# CHANGELOG.md section whose heading starts with
# "## NVIDIA Megatron Core <version>"; tag_name and target_commitish are both
# set to $CI_COMMIT_BRANCH. With PUBLISH_DRYRUN=yes the request is only printed.
test:gh_release:
  extends: [.test_rules]
  tags: [mcore-docker-node-small]
  image: nvcr.io/nvidia/pytorch:24.01-py3
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    - NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
    # Keep the lines between the matching "## <NAME>" heading and the next
    # "## " heading, then drop empty lines.
    - CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
    - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
    # jq builds the JSON so the changelog text is escaped correctly.
    - |
      PAYLOAD=$(jq -nc \
        --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \
        --arg NAME "$NAME" \
        --arg BODY "$CHANGELOG" \
        '{
          "tag_name": $CI_COMMIT_BRANCH,
          "target_commitish": $CI_COMMIT_BRANCH,
          "name": $NAME,
          "body": $BODY,
          "draft": false,
          "prerelease": false,
          "generate_release_notes": false
        }'
      )
    # The arguments live in a bash array so every header and the JSON payload
    # reach curl as exactly one argument each. Flattening the command into a
    # string and eval-ing it would re-split the headers and mangle the JSON.
    - |
      CURL_ARGS=(
        -L
        -X POST
        -H "Accept: application/vnd.github+json"
        -H "X-GitHub-Api-Version: 2022-11-28"
        https://api.github.com/repos/NVIDIA/Megatron-LM/releases
        -d "$PAYLOAD"
      )
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        # Print a shell-quoted preview; the token is replaced by a placeholder
        # so it never ends up in the job log.
        printf '%q ' curl "${CURL_ARGS[@]}" -H 'Authorization: Bearer <GH_TOKEN>'
        echo
      else
        curl "${CURL_ARGS[@]}" -H "Authorization: Bearer $GH_TOKEN"
      fi
# Posts a release announcement to the webhook in MCORE_NOTIFICATION_HOOK_MAIN
# once the wheel upload and the GitHub release jobs are done. With
# PUBLISH_DRYRUN=yes the request is only printed.
test:notify_release:
  needs: [test:pypi_push_wheel, test:gh_release]
  extends: [.test_rules]
  image: nvcr.io/nvidia/pytorch:24.01-py3
  tags: [mcore-docker-node-small]
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - VERSION=$(python -c "from megatron import core; print(core.__version__)")
    - URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_r$VERSION"
    # The JSON is single-quoted; $URL and $VERSION are spliced in through
    # double-quoted segments so they are never word-split or globbed.
    - |
      MESSAGE='{
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron-Core released <'"$URL"'|core_r'"$VERSION"'> 🚀"
            }
          }
        ]
      }'
    - echo "$MESSAGE"
    # The arguments live in a bash array so the header and the JSON message
    # reach curl as one argument each. Flattening the command into a string
    # and eval-ing it would re-split them on whitespace.
    - |
      CURL_ARGS=(
        -X POST
        -H "Content-type: application/json"
        --data "$MESSAGE"
      )
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        # Print a shell-quoted preview; the webhook URL is replaced by a
        # placeholder so it never ends up in the job log.
        printf '%q ' curl "${CURL_ARGS[@]}" '<MCORE_NOTIFICATION_HOOK_MAIN>'
        echo
      else
        curl "${CURL_ARGS[@]}" "${MCORE_NOTIFICATION_HOOK_MAIN}"
      fi
6 changes: 6 additions & 0 deletions .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ functional:configure:
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment dev \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--test-cases $FUNCTIONAL_TEST_CASES \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${CI_MCORE_LTS_IMAGE} \
Expand All @@ -68,6 +71,9 @@ functional:configure:
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment lts \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--test-cases $FUNCTIONAL_TEST_CASES \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${CI_MCORE_LTS_IMAGE} \
Expand Down
Loading

0 comments on commit 8a2e40b

Please sign in to comment.