diff --git a/.circleci/config.yml b/.circleci/config.yml index 2e736487..9c122250 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -74,7 +74,7 @@ jobs: default: ubuntu-2004:2022.04.2 ci-resource_class: type: string - default: xlarge + default: 2xlarge image-type: type: string dev-mode: @@ -292,13 +292,14 @@ workflows: dev-mode: [true] with-mpi: [0, 1] image-type: - - tensorflow-cpu - - pytorch-cpu - - tensorflow-cuda - - pytorch-cuda - - tensorflow-ngc - - pytorch13-tf210-rocm56 - - pytorch20-tf210-rocm61 + # - tensorflow-cpu + # - pytorch-cpu + # - tensorflow-cuda + # - pytorch-cuda + # - tensorflow-ngc + # - pytorch13-tf210-rocm56 + # - pytorch20-tf210-rocm61 + - pytorch-ngc exclude: - dev-mode: true with-mpi: 1 @@ -309,42 +310,44 @@ workflows: - dev-mode: true with-mpi: 1 image-type: tensorflow-ngc + - dev-mode: true + with-mpi: 1 + image-type: pytorch-ngc - - build-and-publish-docker: - name: build-and-publish-docker-<>-<>-dev - context: determined-production - filters: *upstream-feature-branch - requires: - - request-publish-dev-docker - matrix: - alias: build-docker-all-gpu - parameters: - use-nvidia-runtime: - - true - dev-mode: [true] - ci-image: - - linux-cuda-11:default - ci-resource_class: - - gpu.nvidia.small.multi - with-mpi: [0] - image-type: - - deepspeed-gpt-neox - - pytorch-ngc + # - build-and-publish-docker: + # name: build-and-publish-docker-<>-<>-dev + # context: determined-production + # filters: *upstream-feature-branch + # requires: + # - request-publish-dev-docker + # matrix: + # alias: build-docker-all-gpu + # parameters: + # use-nvidia-runtime: + # - true + # dev-mode: [true] + # ci-image: + # - linux-cuda-11:default + # ci-resource_class: + # - gpu.nvidia.small.multi + # with-mpi: [0] + # image-type: + # - deepspeed-gpt-neox - - build-and-publish-docker: - name: build-and-publish-docker-<>-<>-<>-dev - context: determined-production - filters: *upstream-feature-branch - requires: - - request-publish-dev-docker - matrix: - alias: build-docker-all-ofi - parameters: - dev-mode: [true] - with-mpi: [1] - with-ofi: [1] - image-type: - - tensorflow-cuda + # - build-and-publish-docker: + # name: build-and-publish-docker-<>-<>-<>-dev + # context: determined-production + # filters: *upstream-feature-branch + # requires: + # - request-publish-dev-docker + # matrix: + # alias: build-docker-all-ofi + # parameters: + # dev-mode: [true] + # with-mpi: [1] + # with-ofi: [1] + # image-type: + # - tensorflow-cuda - publish-cloud-images: name: publish-cloud-images-dev @@ -352,7 +355,7 @@ workflows: filters: *upstream-feature-branch requires: - build-docker-all-cpu - - build-docker-all-gpu - - build-docker-all-ofi + # - build-docker-all-gpu + # - build-docker-all-ofi - request-publish-dev-cloud dev-mode: true diff --git a/Makefile b/Makefile index 45f03780..787f181d 100644 --- a/Makefile +++ b/Makefile @@ -151,29 +151,42 @@ INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev # build hpc together since hpc is dependent on the normal build .PHONY: build-pytorch-ngc build-pytorch-ngc: - docker build -f Dockerfile-pytorch-ngc \ + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + docker buildx create --name builder --driver docker-container --use + docker buildx build -f Dockerfile-pytorch-ngc \ + --platform "linux/arm64" \ --build-arg BASE_IMAGE="$(NGC_PYTORCH_PREFIX):$(NGC_PYTORCH_VERSION)" \ -t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) \ + --push \ . - docker build -f Dockerfile-ngc-hpc \ - --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \ - -t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \ - . - docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests" - docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests" + + # docker build -f Dockerfile-ngc-hpc \ + # --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH)" \ + # -t $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) \ + # . + # docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests" + # docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_PYTORCH_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m \"pytorch or deepspeed\" /workspace/tests" .PHONY: build-tensorflow-ngc build-tensorflow-ngc: - docker build -f Dockerfile-tensorflow-ngc \ + # Binding QEMU to docker allows emulating other architectures. + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + # The docker-container driver supports using QEMU (user mode) to build non-native platforms. + # The Docker container driver allows creation of a managed and customizable BuildKit environment in a dedicated Docker container. + docker buildx create --name builder --driver docker-container --use + docker buildx build -f Dockerfile-tensorflow-ngc \ + --platform "$(PLATFORMS)" \ --build-arg BASE_IMAGE="$(NGC_TENSORFLOW_PREFIX):$(NGC_TENSORFLOW_VERSION)" \ -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) \ + --push \ . - docker build -f Dockerfile-ngc-hpc \ - --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \ - -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \ - . - docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests" - docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests" + # docker build -f Dockerfile-ngc-hpc \ + # --build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH)" \ + # -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \ + # . + # docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests" + # docker run --rm -v `pwd`/tests:/workspace/tests -it $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) /bin/bash -c "pip install pytest && pytest -m tensorflow /workspace/tests" + ifeq ($(WITH_MPICH),1) ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich diff --git a/dockerfile_scripts/install_deepspeed.sh b/dockerfile_scripts/install_deepspeed.sh index 97c79170..c52da17d 100755 --- a/dockerfile_scripts/install_deepspeed.sh +++ b/dockerfile_scripts/install_deepspeed.sh @@ -13,6 +13,11 @@ export DS_BUILD_SPARSE_ATTN=0 export DS_BUILD_EVOFORMER_ATTN=0 export DS_BUILD_CUTLASS_OPS=0 export DS_BUILD_RAGGED_DEVICE_OPS=0 +export DS_BUILD_AIO=0 +# ARM64 CPU doesn't have Adam, Adagrad and Lion optimizers +export DS_BUILD_CPU_ADAM=0 +export DS_BUILD_CPU_ADAGRAD=0 +export DS_BUILD_CPU_LION=0 #Remove 5.2 from TORCH_CUDA_ARCH_LIST, it is no longer supported by deepspeed export TORCH_CUDA_ARCH_LIST=`echo $TORCH_CUDA_ARCH_LIST|sed 's/5.2 //'`