Skip to content

Commit f8c9649

Browse files
authored
Update min framework versions, test that Horovod builds with them (horovod#3452)
Signed-off-by: Enrico Minack <[email protected]>
1 parent a23732e commit f8c9649

9 files changed

+210
-16
lines changed

.buildkite/gen-pipeline.sh

+7-3
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${B
4747
# our baseline again
4848
# printf "test-cpu-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
4949
printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_1 "
50+
# these are the lowest framework versions that Horovod compiles with, but they are not tested
51+
printf "test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin "
5052
5153
# then we vary the frameworks for gpu
5254
# we need CUDA 10.0 as tensorflow-gpu==1.15.5 is compiled against and linked to CUDA 10.0
@@ -60,6 +62,8 @@ tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${B
6062
printf "test-gpu-gloo-py3_8-tf2_7_1-keras2_7_0-torch1_10_2-mxnet1_8_0_p0-pyspark3_2_1 "
6163
printf "test-gpu-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
6264
printf "test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_1 "
65+
# these are the lowest framework versions that Horovod compiles with, but they are not tested
66+
printf "test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin "
6367
6468
# and one final test with mixed cpu+gpu
6569
printf "test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
@@ -414,7 +418,7 @@ oneccl_cmd_mpi="${oneccl_env}:echo:'/mpirun_command_mpi':>:/mpirun_command:&&"
414418
415419
# run all the cpu unit tests and integration tests
416420
for test in ${tests[@]-}; do
417-
if [[ ${test} == *-cpu-* ]]; then
421+
if [[ ${test} == *-cpu-* && ${test} != *min-* ]]; then
418422
# if gloo is specified, run gloo cpu unit tests and integration tests
419423
if [[ ${test} == *-gloo* ]]; then
420424
run_gloo ${test} ${cpu_queue}
@@ -453,7 +457,7 @@ echo "- wait"
453457
454458
# run 4x gpu unit tests
455459
for test in ${tests[@]-}; do
456-
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
460+
if ( [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]] ) && [[ ${test} != *min-* ]]; then
457461
# if gloo is specified, run gloo gpu unit tests
458462
if [[ ${test} == *-gloo* ]]; then
459463
run_gloo_pytest ${test} ${gpux4_queue}
@@ -471,7 +475,7 @@ echo "- wait"
471475
472476
# run 2x gpu integration tests
473477
for test in ${tests[@]-}; do
474-
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
478+
if ( [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]] ) && [[ ${test} != *min-* ]]; then
475479
# if gloo is specified, run gloo gpu integration tests
476480
if [[ ${test} == *-gloo* ]]; then
477481
run_gloo_integration ${test} ${gpux2_queue}

.github/gen-workflow-ci.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -270,12 +270,14 @@ def build_and_test_images(id: str,
270270
name: str,
271271
needs: List[str],
272272
images: List[str],
273-
parallel_images: str,
274273
tests_per_image: Dict[str, Set[str]],
275274
tests: Dict[str, Dict],
275+
parallel_images: int = None,
276276
attempts: int = 3) -> str:
277277
if 'init-workflow' not in needs:
278278
needs.insert(0, 'init-workflow')
279+
if parallel_images is None:
280+
parallel_images = len(images)
279281
failure = "'failure'"
280282
return (f' {id}:\n'
281283
f' name: "{name} (${{{{ matrix.image }}}})"\n'
@@ -286,13 +288,13 @@ def build_and_test_images(id: str,
286288
f' runs-on: ubuntu-latest\n'
287289
f'\n'
288290
f' strategy:\n'
289-
f' max-parallel: {len([image for image in images if parallel_images in image])}\n'
291+
f' max-parallel: {parallel_images}\n'
290292
f' fail-fast: false\n'
291293
f' matrix:\n'
292294
f' include:\n' +
293295
'\n'.join([f' - image: {image}\n' +
294296
f''.join([f' {test}: true\n'
295-
for test in sorted(list(tests_per_image[image]))]) +
297+
for test in sorted(list(tests_per_image.get(image, [])))]) +
296298
f' build_timeout: {30 if "-cpu-" in image else 40}\n'
297299
for image in sorted(images)
298300
# oneccl does not compile on GitHub Workflows:
@@ -603,7 +605,7 @@ def publish_docker_images(needs: List[str], images: List[str]) -> str:
603605
return (f' docker-config:\n'
604606
f' name: Configure docker build\n'
605607
f' needs: [{", ".join(needs)}]\n'
606-
f" # build-and-test-cpu, build-gpu and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)\n"
608+
f" # build-and-test and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)\n"
607609
f' # buildkite might have been skipped (workflow runs for a fork PR),\n'
608610
f' # we still want to build docker images (though we might not want to push them)\n'
609611
f' if: >\n'
@@ -795,16 +797,19 @@ def sync_files(needs: List[str]) -> str:
795797
f' fi\n')
796798

797799
with open(path.joinpath('workflows', 'ci.yaml').absolute(), 'wt') as w:
800+
mins = ['tfmin', 'torchmin', 'mxnetmin']
798801
heads = ['tfhead', 'torchhead', 'mxnethead']
799-
release_images = [image for image in images if not all(head in image for head in heads)]
802+
allmin_images = [image for image in images if all(min in image for min in mins)]
803+
allhead_images = [image for image in images if all(head in image for head in heads)]
804+
release_images = [image for image in images if image not in allhead_images + allmin_images]
800805
cpu_release_images = [image for image in release_images if '-cpu-' in image]
801806
gpu_release_images = [image for image in release_images if '-gpu-' in image or '-mixed-' in image]
802-
allhead_images = [image for image in images if all(head in image for head in heads)]
803807
workflow = workflow_header() + jobs(
804808
init_workflow_job(),
805-
# changing these names require changes in the workflow-conclusion step in ci-fork.yaml
806-
build_and_test_images(id='build-and-test', name='Build and Test', needs=['init-workflow'], images=release_images, parallel_images='-cpu-', tests_per_image=tests_per_image, tests=tests),
807-
build_and_test_images(id='build-and-test-heads', name='Build and Test heads', needs=['build-and-test'], images=allhead_images, parallel_images='', tests_per_image=tests_per_image, tests=tests),
809+
# changing these names requires changes in the workflow-conclusion step in ci-results.yaml
810+
build_and_test_images(id='build-and-test', name='Build and Test', needs=['init-workflow'], images=release_images, parallel_images=len(cpu_release_images), tests_per_image=tests_per_image, tests=tests),
811+
build_and_test_images(id='build-and-test-heads', name='Build and Test heads', needs=['build-and-test'], images=allhead_images, tests_per_image=tests_per_image, tests=tests),
812+
build_and_test_images(id='build-mins', name='Build mins', needs=['build-and-test'], images=allmin_images, tests_per_image=tests_per_image, tests={}),
808813
build_and_test_macos(id='build-and-test-macos', name='Build and Test macOS', needs=['build-and-test']),
809814
trigger_buildkite_job(id='buildkite', name='Build and Test GPU (on Buildkite)', needs=['build-and-test'], mode='GPU NON HEADS'),
810815
trigger_buildkite_job(id='buildkite-heads', name='Build and Test GPU heads (on Buildkite)', needs=['build-and-test'], mode='GPU HEADS'),

.github/workflows/ci-results.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ jobs:
3131

3232
steps:
3333
- name: Fetch workflow conclusion
34+
# fetch conclusion of steps building and testing CPU and building GPU
35+
# ignores steps building heads and mins, building and testing macOS, building and testing GPU via Buildkite
3436
id: workflow-conclusion
3537
env:
3638
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/ci.yaml

+135-1
Original file line numberDiff line numberDiff line change
@@ -3353,6 +3353,140 @@ jobs:
33533353
docker image ls | head
33543354
shell: bash
33553355

3356+
build-mins:
3357+
name: "Build mins (${{ matrix.image }})"
3358+
needs: [init-workflow, build-and-test]
3359+
if: >
3360+
needs.init-workflow.outputs.run-at-all == 'true' &&
3361+
needs.init-workflow.outputs.run-builds-and-tests == 'true'
3362+
runs-on: ubuntu-latest
3363+
3364+
strategy:
3365+
max-parallel: 2
3366+
fail-fast: false
3367+
matrix:
3368+
include:
3369+
- image: test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
3370+
build_timeout: 30
3371+
3372+
- image: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
3373+
build_timeout: 40
3374+
3375+
steps:
3376+
- name: Clean up disk space
3377+
# deleting these paths frees 38 GB disk space:
3378+
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
3379+
# but this sometimes takes 3-4 minutes
3380+
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB
3381+
run: |
3382+
echo ::group::Disk space before clean up
3383+
df -h
3384+
echo ::endgroup::
3385+
3386+
for dir in /usr/share/dotnet/sdk/*/nuGetPackagesArchive.lzma \
3387+
/usr/share/dotnet/shared \
3388+
/usr/local/lib/android/sdk/ndk \
3389+
/usr/local/lib/android/sdk/build-tools \
3390+
/opt/ghc
3391+
do
3392+
echo ::group::Deleting "$dir"
3393+
sudo du -hsc $dir | tail -n1 || true
3394+
sudo rm -rf $dir
3395+
echo ::endgroup::
3396+
done
3397+
3398+
echo ::group::Disk space after clean up
3399+
df -h
3400+
echo ::endgroup::
3401+
3402+
- name: Checkout
3403+
uses: actions/checkout@v2
3404+
with:
3405+
submodules: recursive
3406+
3407+
- name: Setup Python
3408+
uses: actions/setup-python@v2
3409+
with:
3410+
python-version: 3.8
3411+
3412+
- name: Setup docker-compose
3413+
run: pip install docker-compose
3414+
3415+
- name: Configure AWS credentials
3416+
id: aws
3417+
uses: aws-actions/configure-aws-credentials@v1
3418+
# AWS credentials are used to authenticate against AWS ECR to pull and push test images
3419+
# We can only authenticate when running on Horovod repo (not a fork)
3420+
if: >
3421+
github.repository == 'horovod/horovod' &&
3422+
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
3423+
continue-on-error: true
3424+
with:
3425+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
3426+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
3427+
aws-region: us-east-1
3428+
3429+
- name: Login to Amazon ECR
3430+
id: ecr
3431+
if: steps.aws.outcome == 'success'
3432+
continue-on-error: true
3433+
uses: aws-actions/amazon-ecr-login@v1
3434+
3435+
- name: Add cache_from to docker-compose YAML
3436+
if: steps.ecr.outcome == 'success'
3437+
run: |
3438+
cat > docker-compose.test.override.yml <<EOF
3439+
version: '2.3'
3440+
services:
3441+
${{ matrix.image }}:
3442+
build:
3443+
cache_from:
3444+
- ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
3445+
EOF
3446+
cat docker-compose.test.override.yml
3447+
shell: bash
3448+
3449+
- name: Pull latest test image
3450+
if: steps.ecr.outcome == 'success'
3451+
continue-on-error: true
3452+
run: |
3453+
docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
3454+
env:
3455+
DOCKER_BUILDKIT: 1
3456+
3457+
- name: Build
3458+
id: build
3459+
run: |
3460+
override_yaml=""
3461+
if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi
3462+
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }}
3463+
env:
3464+
COMPOSE_DOCKER_CLI_BUILD: 1
3465+
DOCKER_BUILDKIT: 1
3466+
3467+
3468+
- name: Upload Test Results
3469+
uses: actions/upload-artifact@v2
3470+
if: always() && contains(matrix.image, '-cpu-')
3471+
with:
3472+
name: Unit Test Results - ${{ matrix.image }}
3473+
path: artifacts/${{ matrix.image }}/**/*.xml
3474+
3475+
- name: Push test image
3476+
# We push test image to AWS ECR on push to Horovod master (not a fork)
3477+
if: >
3478+
github.event_name == 'push' &&
3479+
github.ref == 'refs/heads/master' &&
3480+
github.repository == 'horovod/horovod' &&
3481+
steps.ecr.outcome == 'success'
3482+
continue-on-error: true
3483+
run: |
3484+
docker image ls | head
3485+
docker tag horovod_${{ matrix.image }} ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
3486+
docker push ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
3487+
docker image ls | head
3488+
shell: bash
3489+
33563490
build-and-test-macos:
33573491
name: "Build and Test macOS (${{ matrix.image }}-macos)"
33583492
needs: [init-workflow, build-and-test]
@@ -3604,7 +3738,7 @@ jobs:
36043738
docker-config:
36053739
name: Configure docker build
36063740
needs: [init-workflow, build-and-test, buildkite]
3607-
# build-and-test-cpu, build-gpu and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)
3741+
# build-and-test and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)
36083742
# buildkite might have been skipped (workflow runs for a fork PR),
36093743
# we still want to build docker images (though we might not want to push them)
36103744
if: >

docker-compose.test.yml

+32
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,21 @@ services:
9797
TORCHVISION_PACKAGE: torchvision
9898
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
9999
MXNET_PACKAGE: mxnet-nightly
100+
# these are the lowest framework versions that Horovod compiles with, but they are not tested
101+
test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
102+
extends: test-cpu-base
103+
build:
104+
args:
105+
UBUNTU_VERSION: 18.04
106+
PYTHON_VERSION: 3.7
107+
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
108+
KERAS_PACKAGE: keras==2.2.4
109+
PYTORCH_PACKAGE: torch==1.5.0+cpu
110+
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
111+
TORCHVISION_PACKAGE: torchvision==0.6.0+cpu
112+
MXNET_PACKAGE: mxnet==1.4.1
113+
PYSPARK_PACKAGE: pyspark==2.4.0
114+
SPARK_PACKAGE: spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
100115

101116
test-cpu-gloo-py3_7-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark2_4_8:
102117
extends: test-cpu-base
@@ -213,6 +228,23 @@ services:
213228
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
214229
TORCHVISION_PACKAGE: torchvision
215230
MXNET_PACKAGE: mxnet-nightly-cu112
231+
# these are the lowest framework versions that Horovod compiles with, but they are not tested
232+
test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
233+
extends: test-gpu-base
234+
build:
235+
args:
236+
CUDA_DOCKER_VERSION: 10.0-devel-ubuntu18.04
237+
CUDNN_VERSION: 7.6.5.32-1+cuda10.1
238+
NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
239+
PYTHON_VERSION: 3.7
240+
TENSORFLOW_PACKAGE: tensorflow-gpu==1.15.0
241+
KERAS_PACKAGE: keras==2.2.4
242+
PYTORCH_PACKAGE: torch==1.5.0+cu101
243+
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
244+
TORCHVISION_PACKAGE: torchvision==0.6.0+cu101
245+
MXNET_PACKAGE: mxnet-cu100==1.4.1
246+
PYSPARK_PACKAGE: pyspark==2.4.0
247+
SPARK_PACKAGE: spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
216248

217249
test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1:
218250
extends: test-gpu-base

docs/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Requirements
1010
- Python >= 3.6
1111
- `g++-5` or above, or another compiler supporting C++14
1212
- CMake 3.13 or newer
13-
- TensorFlow, PyTorch, or MXNet
13+
- TensorFlow (>=1.15.0), PyTorch (>=1.5.0), or MXNet (>=1.4.1)
1414
- (Optional) MPI
1515

1616
For best performance on GPU:

horovod/mxnet/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ set(Mxnet_REQUIRED "")
99
if ("$ENV{HOROVOD_WITH_MXNET}" STREQUAL "1")
1010
set(Mxnet_REQUIRED "REQUIRED")
1111
endif ()
12-
find_package(Mxnet "1.4.0" ${Mxnet_REQUIRED})
12+
find_package(Mxnet "1.4.1" ${Mxnet_REQUIRED})
1313
if(NOT MXNET_FOUND)
1414
return()
1515
endif()

horovod/torch/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ set(PYTORCH_REQUIRED "")
99
if ("$ENV{HOROVOD_WITH_PYTORCH}" STREQUAL "1")
1010
set(PYTORCH_REQUIRED "REQUIRED")
1111
endif ()
12-
find_package(Pytorch "1.2.0" ${PYTORCH_REQUIRED})
12+
find_package(Pytorch "1.5.0" ${PYTORCH_REQUIRED})
1313
if(NOT PYTORCH_FOUND)
1414
return()
1515
endif()

test/single/data/expected_buildkite_gpu_non_heads_pipeline.yaml

+17
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,23 @@ steps:
6767
automatic: true
6868
agents:
6969
queue: cpu-v572
70+
- label: ':docker: Build test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin'
71+
env:
72+
COMPOSE_HTTP_TIMEOUT: 300
73+
plugins:
74+
- docker-compose#v3.5.0:
75+
build: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
76+
image-repository: 823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite
77+
cache-from: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite:SLUG-test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin-latest
78+
config: docker-compose.test.yml
79+
push-retries: 5
80+
- ecr#v1.2.0:
81+
login: true
82+
timeout_in_minutes: 40
83+
retry:
84+
automatic: true
85+
agents:
86+
queue: cpu-v572
7087
- label: ':docker: Build test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1'
7188
env:
7289
COMPOSE_HTTP_TIMEOUT: 300

0 commit comments

Comments
 (0)