initial version

jgongd · jgongd · commit fe918da656fc · 2024-08-20T13:49:22.000-04:00
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -292,13 +292,14 @@ workflows:
               dev-mode: [true]
               with-mpi: [0, 1]
               image-type:
-                - tensorflow-cpu
-                - pytorch-cpu
-                - tensorflow-cuda
-                - pytorch-cuda
+                # - tensorflow-cpu
+                # - pytorch-cpu
+                # - tensorflow-cuda
+                # - pytorch-cuda
                 - tensorflow-ngc
-                - pytorch13-tf210-rocm56
-                - pytorch20-tf210-rocm61
+                # - pytorch13-tf210-rocm56
+                # - pytorch20-tf210-rocm61
+                - pytorch-ngc
             exclude:
               - dev-mode: true
                 with-mpi: 1
@@ -309,50 +310,52 @@ workflows:
               - dev-mode: true
                 with-mpi: 1
                 image-type: tensorflow-ngc
+              - dev-mode: true
+                with-mpi: 1
+                image-type: pytorch-ngc
 
-      - build-and-publish-docker:
-          name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
-          context: determined-production
-          filters: *upstream-feature-branch
-          requires:
-            - request-publish-dev-docker
-          matrix:
-            alias: build-docker-all-gpu
-            parameters:
-              use-nvidia-runtime:
-                - true
-              dev-mode: [true]
-              ci-image:
-                - linux-cuda-11:default
-              ci-resource_class:
-                - gpu.nvidia.small.multi
-              with-mpi: [0]
-              image-type:
-                - deepspeed-gpt-neox
-                - pytorch-ngc
+      # - build-and-publish-docker:
+      #     name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev
+      #     context: determined-production
+      #     filters: *upstream-feature-branch
+      #     requires:
+      #       - request-publish-dev-docker
+      #     matrix:
+      #       alias: build-docker-all-gpu
+      #       parameters:
+      #         use-nvidia-runtime:
+      #           - true
+      #         dev-mode: [true]
+      #         ci-image:
+      #           - linux-cuda-11:default
+      #         ci-resource_class:
+      #           - gpu.nvidia.small.multi
+      #         with-mpi: [0]
+      #         image-type:
+      #           - deepspeed-gpt-neox
 
-      - build-and-publish-docker:
-          name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-<<matrix.with-ofi>>-dev
-          context: determined-production
-          filters: *upstream-feature-branch
-          requires:
-            - request-publish-dev-docker
-          matrix:
-            alias: build-docker-all-ofi
-            parameters:
-              dev-mode: [true]
-              with-mpi: [1]
-              with-ofi: [1]
-              image-type:
-                - tensorflow-cuda
+      # - build-and-publish-docker:
+      #     name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-<<matrix.with-ofi>>-dev
+      #     context: determined-production
+      #     filters: *upstream-feature-branch
+      #     requires:
+      #       - request-publish-dev-docker
+      #     matrix:
+      #       alias: build-docker-all-ofi
+      #       parameters:
+      #         dev-mode: [true]
+      #         with-mpi: [1]
+      #         with-ofi: [1]
+      #         image-type:
+      #           - tensorflow-cuda
 
       - publish-cloud-images:
           name: publish-cloud-images-dev
           context: determined-production
           filters: *upstream-feature-branch
           requires:
             - build-docker-all-cpu
-            - build-docker-all-gpu
-            - build-docker-all-ofi
+            # - build-docker-all-gpu
+            # - build-docker-all-ofi
             - request-publish-dev-cloud
           dev-mode: true
diff --git a/Makefile b/Makefile
@@ -151,29 +151,42 @@ INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
 # build hpc together since hpc is dependent on the normal build
 .PHONY: build-pytorch-ngc
 build-pytorch-ngc:
-	docker build -f Dockerfile-pytorch-ngc \
+	docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+	docker buildx create --name builder --driver docker-container --use
+	docker buildx build -f Dockerfile-pytorch-ngc \
+		--platform "$(PLATFORMS)" \
 		--build-arg BASE_IMAGE="$(NGC_PYTORCH_PREFIX):$(NGC_PYTORCH_VERSION)" \
 		-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) \
+		--push \
 		.
-	docker build -f Dockerfile-ngc-hpc \
-		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
-		-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
-		.
-	docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
-	docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
+	
+	# docker build -f Dockerfile-ngc-hpc \
+	# 	--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \
+	# 	-t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \
+	# 	.
+	# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
+	# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests"
 
 .PHONY: build-tensorflow-ngc
 build-tensorflow-ngc:
-	docker build -f Dockerfile-tensorflow-ngc \
+	# Register QEMU with Docker so that images for other CPU architectures can be built under emulation.
+	docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+	# The docker-container driver creates a managed, customizable BuildKit environment in a dedicated Docker container,
+	# and it supports using QEMU (user mode) to build images for non-native platforms.
+	docker buildx create --name builder --driver docker-container --use
+	docker buildx build -f Dockerfile-tensorflow-ngc \
+		--platform "$(PLATFORMS)" \
 		--build-arg BASE_IMAGE="$(NGC_TENSORFLOW_PREFIX):$(NGC_TENSORFLOW_VERSION)" \
 		-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) \
+		--push \
 		.
-	docker build -f Dockerfile-ngc-hpc \
-		--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
-		-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
-		.
-	docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
-	docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
+	# docker build -f Dockerfile-ngc-hpc \
+	# 	--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \
+	# 	-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
+	# 	.
+	# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
+	# docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests"
+
 
 ifeq ($(WITH_MPICH),1)
 ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
diff --git a/dockerfile_scripts/install_deepspeed.sh b/dockerfile_scripts/install_deepspeed.sh
@@ -13,6 +13,11 @@ export DS_BUILD_SPARSE_ATTN=0
 export DS_BUILD_EVOFORMER_ATTN=0
 export DS_BUILD_CUTLASS_OPS=0
 export DS_BUILD_RAGGED_DEVICE_OPS=0
+export DS_BUILD_AIO=0
+# The CPU Adam, Adagrad and Lion optimizer ops are not available on ARM64 CPUs, so do not build them.
+export DS_BUILD_CPU_ADAM=0
+export DS_BUILD_CPU_ADAGRAD=0
+export DS_BUILD_CPU_LION=0
 
 #Remove 5.2 from TORCH_CUDA_ARCH_LIST, it is no longer supported by deepspeed
 export TORCH_CUDA_ARCH_LIST=`echo $TORCH_CUDA_ARCH_LIST|sed 's/5.2 //'`