Skip to content

Commit 04d2ae9

Browse files
committed
initial version
1 parent 0736b6d commit 04d2ae9

File tree

3 files changed

+35
-14
lines changed

3 files changed

+35
-14
lines changed

.circleci/config.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ workflows:
309309
- dev-mode: true
310310
with-mpi: 1
311311
image-type: tensorflow-ngc
312+
- dev-mode: true
313+
with-mpi: 1
314+
image-type: pytorch-ngc
312315

313316
- build-and-publish-docker:
314317
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev

Makefile

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -151,29 +151,42 @@ INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
151151
# Build the HPC image together with the base image, since the HPC image depends on the base build.
152152
.PHONY: build-pytorch-ngc
153153
build-pytorch-ngc:
154-
docker build -f Dockerfile-pytorch-ngc \
154+
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
155+
docker buildx create --name builder --driver docker-container --use
156+
docker buildx build -f Dockerfile-pytorch-ngc \
157+
--platform "$(PLATFORMS)" \
155158
--build-arg BASE_IMAGE="$(NGC_PYTORCH_PREFIX):$(NGC_PYTORCH_VERSION)" \
156159
-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) \
160+
--push \
157161
.
158-
docker build -f Dockerfile-ngc-hpc \
159-
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
160-
-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
161-
.
162-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
163-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
162+
163+
# docker build -f Dockerfile-ngc-hpc \
164+
# --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
165+
# -t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
166+
# .
167+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
168+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
164169

165170
.PHONY: build-tensorflow-ngc
166171
build-tensorflow-ngc:
167-
docker build -f Dockerfile-tensorflow-ngc \
172+
# Binding QEMU to docker allows emulating other architectures.
173+
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
174+
# The docker-container driver supports using QEMU (user mode) to build non-native platforms.
175+
# The docker-container driver creates a managed, customizable BuildKit environment in a dedicated Docker container.
176+
docker buildx create --name builder --driver docker-container --use
177+
docker buildx build -f Dockerfile-tensorflow-ngc \
178+
--platform "$(PLATFORMS)" \
168179
--build-arg BASE_IMAGE="$(NGC_TENSORFLOW_PREFIX):$(NGC_TENSORFLOW_VERSION)" \
169180
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) \
181+
--push \
170182
.
171-
docker build -f Dockerfile-ngc-hpc \
172-
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
173-
-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
174-
.
175-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
176-
docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
183+
# docker build -f Dockerfile-ngc-hpc \
184+
# --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
185+
# -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
186+
# .
187+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
188+
# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
189+
177190

178191
ifeq ($(WITH_MPICH),1)
179192
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich

dockerfile_scripts/install_deepspeed.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ export DS_BUILD_SPARSE_ATTN=0
1313
export DS_BUILD_EVOFORMER_ATTN=0
1414
export DS_BUILD_CUTLASS_OPS=0
1515
export DS_BUILD_RAGGED_DEVICE_OPS=0
16+
export DS_BUILD_AIO=0
17+
# DeepSpeed's CPU Adam, Adagrad and Lion ops are not supported on ARM64 CPUs, so skip building them
18+
export DS_BUILD_CPU_ADAM=0
19+
export DS_BUILD_CPU_ADAGRAD=0
20+
export DS_BUILD_CPU_LION=0
1621

1722
# Remove 5.2 from TORCH_CUDA_ARCH_LIST; it is no longer supported by DeepSpeed
1823
export TORCH_CUDA_ARCH_LIST=`echo $TORCH_CUDA_ARCH_LIST|sed 's/5.2 //'`

0 commit comments

Comments
 (0)