Skip to content

Commit fe918da

Browse files
committed
initial version
1 parent 0736b6d commit fe918da

File tree

3 files changed

+77
-56
lines changed

3 files changed

+77
-56
lines changed

.circleci/config.yml

Lines changed: 45 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -292,13 +292,14 @@ workflows:
292292
dev-mode: [true]
293293
with-mpi: [0, 1]
294294
image-type:
295-
- tensorflow-cpu
296-
- pytorch-cpu
297-
- tensorflow-cuda
298-
- pytorch-cuda
295+
# - tensorflow-cpu
296+
# - pytorch-cpu
297+
# - tensorflow-cuda
298+
# - pytorch-cuda
299299
- tensorflow-ngc
300-
- pytorch13-tf210-rocm56
301-
- pytorch20-tf210-rocm61
300+
# - pytorch13-tf210-rocm56
301+
# - pytorch20-tf210-rocm61
302+
- pytorch-ngc
302303
exclude:
303304
- dev-mode: true
304305
with-mpi: 1
@@ -309,50 +310,52 @@ workflows:
309310
- dev-mode: true
310311
with-mpi: 1
311312
image-type: tensorflow-ngc
313+
- dev-mode: true
314+
with-mpi: 1
315+
image-type: pytorch-ngc
312316

313-
- build-and-publish-docker:
314-
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
315-
context: determined-production
316-
filters: *upstream-feature-branch
317-
requires:
318-
- request-publish-dev-docker
319-
matrix:
320-
alias: build-docker-all-gpu
321-
parameters:
322-
use-nvidia-runtime:
323-
- true
324-
dev-mode: [true]
325-
ci-image:
326-
- linux-cuda-11:default
327-
ci-resource_class:
328-
- gpu.nvidia.small.multi
329-
with-mpi: [0]
330-
image-type:
331-
- deepspeed-gpt-neox
332-
- pytorch-ngc
317+
# - build-and-publish-docker:
318+
# name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
319+
# context: determined-production
320+
# filters: *upstream-feature-branch
321+
# requires:
322+
# - request-publish-dev-docker
323+
# matrix:
324+
# alias: build-docker-all-gpu
325+
# parameters:
326+
# use-nvidia-runtime:
327+
# - true
328+
# dev-mode: [true]
329+
# ci-image:
330+
# - linux-cuda-11:default
331+
# ci-resource_class:
332+
# - gpu.nvidia.small.multi
333+
# with-mpi: [0]
334+
# image-type:
335+
# - deepspeed-gpt-neox
333336

334-
- build-and-publish-docker:
335-
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-<<matrix.with-ofi>>-dev
336-
context: determined-production
337-
filters: *upstream-feature-branch
338-
requires:
339-
- request-publish-dev-docker
340-
matrix:
341-
alias: build-docker-all-ofi
342-
parameters:
343-
dev-mode: [true]
344-
with-mpi: [1]
345-
with-ofi: [1]
346-
image-type:
347-
- tensorflow-cuda
337+
# - build-and-publish-docker:
338+
# name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-<<matrix.with-ofi>>-dev
339+
# context: determined-production
340+
# filters: *upstream-feature-branch
341+
# requires:
342+
# - request-publish-dev-docker
343+
# matrix:
344+
# alias: build-docker-all-ofi
345+
# parameters:
346+
# dev-mode: [true]
347+
# with-mpi: [1]
348+
# with-ofi: [1]
349+
# image-type:
350+
# - tensorflow-cuda
348351

349352
- publish-cloud-images:
350353
name: publish-cloud-images-dev
351354
context: determined-production
352355
filters: *upstream-feature-branch
353356
requires:
354357
- build-docker-all-cpu
355-
- build-docker-all-gpu
356-
- build-docker-all-ofi
358+
# - build-docker-all-gpu
359+
# - build-docker-all-ofi
357360
- request-publish-dev-cloud
358361
dev-mode: true

Makefile

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -151,29 +151,42 @@ INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
151151
# Build the HPC image together with the base image, since the HPC image depends on the base build.
152152
.PHONY: build-pytorch-ngc
153153
build-pytorch-ngc:
154-
docker build -f Dockerfile-pytorch-ngc \
154+
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
155+
docker buildx create --name builder --driver docker-container --use
156+
docker buildx build -f Dockerfile-pytorch-ngc \
157+
--platform "$(PLATFORMS)" \
155158
--build-arg BASE_IMAGE="$(NGC_PYTORCH_PREFIX):$(NGC_PYTORCH_VERSION)" \
156159
-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) \
160+
--push \
157161
.
158-
docker build -f Dockerfile-ngc-hpc \
159-
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
160-
-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
161-
.
162-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
163-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
162+
163+
# docker build -f Dockerfile-ngc-hpc \
164+
# --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
165+
# -t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
166+
# .
167+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
168+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
164169

165170
.PHONY: build-tensorflow-ngc
166171
build-tensorflow-ngc:
167-
docker build -f Dockerfile-tensorflow-ngc \
172+
# Binding QEMU to docker allows emulating other architectures.
173+
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
174+
# The docker-container driver supports using QEMU (user mode) to build non-native platforms.
175+
# The Docker container driver allows creation of a managed and customizable BuildKit environment in a dedicated Docker container.
176+
docker buildx create --name builder --driver docker-container --use
177+
docker buildx build -f Dockerfile-tensorflow-ngc \
178+
--platform "$(PLATFORMS)" \
168179
--build-arg BASE_IMAGE="$(NGC_TENSORFLOW_PREFIX):$(NGC_TENSORFLOW_VERSION)" \
169180
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) \
181+
--push \
170182
.
171-
docker build -f Dockerfile-ngc-hpc \
172-
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
173-
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
174-
.
175-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
176-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
183+
# docker build -f Dockerfile-ngc-hpc \
184+
# --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
185+
# -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
186+
# .
187+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
188+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
189+
177190

178191
ifeq ($(WITH_MPICH),1)
179192
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich

dockerfile_scripts/install_deepspeed.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ export DS_BUILD_SPARSE_ATTN=0
1313
export DS_BUILD_EVOFORMER_ATTN=0
1414
export DS_BUILD_CUTLASS_OPS=0
1515
export DS_BUILD_RAGGED_DEVICE_OPS=0
16+
export DS_BUILD_AIO=0
17+
# Skip building the CPU Adam, Adagrad and Lion optimizer ops: they are not available on ARM64 CPUs.
18+
export DS_BUILD_CPU_ADAM=0
19+
export DS_BUILD_CPU_ADAGRAD=0
20+
export DS_BUILD_CPU_LION=0
1621

1722
# Remove 5.2 from TORCH_CUDA_ARCH_LIST; it is no longer supported by DeepSpeed.
1823
export TORCH_CUDA_ARCH_LIST=`echo $TORCH_CUDA_ARCH_LIST|sed 's/5.2 //'`

0 commit comments

Comments
 (0)