Skip to content

feat: PyTorch Extras Container #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,25 @@ on:
tag-suffix:
required: false
type: string
# Outputs re-exported from the `build` job so callers of this reusable
# workflow can consume the build result.
outputs:
  outcome:
    description: "The outcome of the build"
    value: ${{ jobs.build.outputs.outcome }}
  tags:
    description: "The resulting image tags"
    value: ${{ jobs.build.outputs.tags }}
  version:
    description: "The resulting image version"
    # Fix: previously forwarded jobs.build.outputs.tags (copy-paste error),
    # so the `version` output duplicated `tags` instead of the job's
    # `version` output defined from steps.meta.outputs.version.
    value: ${{ jobs.build.outputs.version }}

jobs:
build:
name: Build Images
runs-on: [self-hosted, Linux]
outputs:
outcome: ${{ steps.docker-build.outcome }}
tags: ${{ steps.meta.outputs.tags }}
version: ${{ steps.meta.outputs.version }}
steps:
- uses: actions/checkout@v3
- name: Set up Docker Buildx
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/torch-base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ jobs:
base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04
torch-version: ${{ matrix.torch }}
torchvision-version: ${{ matrix.vision }}
torchaudio-version: ${{ matrix.audio }}
torchaudio-version: ${{ matrix.audio }}
build-extras: true
3 changes: 2 additions & 1 deletion .github/workflows/torch-nccl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ jobs:
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
torch-version: ${{ matrix.torch }}
torchvision-version: ${{ matrix.vision }}
torchaudio-version: ${{ matrix.audio }}
torchaudio-version: ${{ matrix.audio }}
build-extras: true
22 changes: 20 additions & 2 deletions .github/workflows/torch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ on:
required: false
type: string
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
build-extras:
required: false
type: boolean
default: false

workflow_dispatch:
inputs:
Expand All @@ -48,11 +52,15 @@ on:
required: false
type: string
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
build-extras:
required: false
type: boolean
default: false

jobs:
build:
uses: ./.github/workflows/build.yml
with:
with:
image-name: torch
folder: torch
tag-suffix: ${{ inputs.tag }}
Expand All @@ -63,4 +71,14 @@ jobs:
BUILD_TORCH_VERSION=${{ inputs.torch-version }}
BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
# Follow-up job: builds the torch-extras image on top of the torch image
# produced by the `build` job above. Runs only when the caller passes
# build-extras: true.
build-extras:
  if: inputs.build-extras
  needs: build
  uses: ./.github/workflows/build.yml
  with:
    image-name: torch-extras
    folder: torch-extras
    tag-suffix: ${{ inputs.tag }}
    # NOTE(review): needs.build.outputs.tags may contain multiple
    # newline-separated tags (docker/metadata-action behavior) — confirm the
    # build produces exactly one tag here, or BASE_IMAGE will be malformed.
    build-args: |
      BASE_IMAGE=${{ needs.build.outputs.tags }}
134 changes: 134 additions & 0 deletions torch-extras/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# syntax=docker/dockerfile:1.2

# Global build args. BASE_IMAGE is the image to extend (expected to provide
# python3/pip and a CUDA apt setup — see builder-base); the version args pin
# the releases built below. Stages that use one must re-declare it bare.
ARG BASE_IMAGE
ARG DEEPSPEED_VERSION="0.9.4"
ARG FLASH_ATTN_VERSION="1.0.7"

# Stage: fetch the flash-attention sources with git in a minimal image, so
# the heavyweight build stages below need neither git nor network access.
FROM alpine/git:2.36.3 as flash-attn-downloader
WORKDIR /git
ARG FLASH_ATTN_VERSION
# Shallow clone of the tagged release (submodules included, also shallow);
# the .git directory is removed to shrink what gets bind-mounted later.
RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
    https://github.com/HazyResearch/flash-attention -b v${FLASH_ATTN_VERSION} && \
    rm -rf flash-attention/.git


# Dependencies requiring NVCC are built ahead of time in a separate stage
# so that the ~2 GiB dev library installations don't have to be included
# in the final image.
FROM ${BASE_IMAGE} as builder-base
# Derive the apt package suffix (e.g. "11-8") from $CUDA_VERSION, which is
# assumed to be exported by the CUDA base image — TODO(review): confirm.
RUN export \
      CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
      CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
    export \
      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
    # Dev-only CUDA components needed to compile the extensions; they stay
    # out of the final image thanks to the multi-stage build.
    apt-get -qq update && apt-get install -y --no-install-recommends \
      cuda-nvcc-${CUDA_PACKAGE_VERSION} \
      cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
      libcurand-dev-${CUDA_PACKAGE_VERSION} \
      libcublas-dev-${CUDA_PACKAGE_VERSION} \
      libcusparse-dev-${CUDA_PACKAGE_VERSION} \
      libcusolver-dev-${CUDA_PACKAGE_VERSION} \
      cuda-nvprof-${CUDA_PACKAGE_VERSION} \
      cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
      libaio-dev \
      ninja-build \
      parallel \
    # gcc-10/g++-10/lld do not need to be installed here, but they improve the build.
    # gfortran-10 is just for compiler_wrapper.f95.
      gcc-10 g++-10 gfortran-10 lld && \
    apt-get clean && \
    # Make the GCC 10 toolchain and the LLD linker the system defaults.
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
    update-alternatives --install \
      /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \
    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1

# /wheels collects built wheel files; /build is the scratch working dir.
RUN mkdir /wheels /build
WORKDIR /build

# DeepSpeed forces -march=native into the compiler options,
# making the result dependent on the processor architecture
# used on the builder machine.
# The compiler wrapper normalizes -march=native to -march=skylake
# along with a couple other transformations before invoking GCC.
COPY compiler_wrapper.f95 .
# Compile the wrapper, then drop its source; ./compiler is used as CC below.
RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95


# Stage: build the DeepSpeed wheel with its C++/CUDA ops precompiled.
FROM builder-base as deepspeed-builder
# DeepSpeed build flags
# See: https://www.deepspeed.ai/tutorials/advanced-install
# A blank ("") value means "leave unset" — see the unsetting loop below.
ARG DS_BUILD_OPS="1"
ARG DS_BUILD_CPU_ADAM=""
ARG DS_BUILD_FUSED_ADAM=""
ARG DS_BUILD_FUSED_LAMB=""
# sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
ARG DS_BUILD_SPARSE_ATTN="0"
ARG DS_BUILD_TRANSFORMER=""
ARG DS_BUILD_TRANSFORMER_INFERENCE=""
ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
ARG DS_BUILD_UTILS=""
ARG DS_BUILD_AIO=""

ARG DEEPSPEED_VERSION

# bash is required for the ${!VAR} indirect expansion in the loop below.
SHELL ["/bin/bash", "-c"]
RUN python3 -m pip install -U --no-cache-dir \
    setuptools wheel pip && \
    { \
    # DeepSpeed doesn't handle blank environment variables
    # in the same way as unset ones, so clear any blank ones.
    for VAR in \
    DS_BUILD_OPS \
    DS_BUILD_CPU_ADAM \
    DS_BUILD_FUSED_ADAM \
    DS_BUILD_FUSED_LAMB \
    DS_BUILD_SPARSE_ATTN \
    DS_BUILD_TRANSFORMER \
    DS_BUILD_TRANSFORMER_INFERENCE \
    DS_BUILD_STOCHASTIC_TRANSFORMER \
    DS_BUILD_UTILS \
    DS_BUILD_AIO; \
    do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
    } && \
    # Build the wheel with the -march-normalizing wrapper as CC;
    # --no-build-isolation so the preinstalled torch is visible to setup.py.
    CC=$(realpath -e ./compiler) \
    python3 -m pip wheel -w /wheels \
    --no-cache-dir --no-build-isolation --no-deps \
    deepspeed==${DEEPSPEED_VERSION} && \
    rm ./*
SHELL ["/bin/sh", "-c"]

WORKDIR /wheels


# Stage: build flash-attention and its CUDA extension sub-packages as wheels.
FROM builder-base as flash-attn-builder
ARG FLASH_ATTN_VERSION

# Sources come from the downloader stage via a bind mount (no COPY layer).
# `parallel` builds the main package plus each csrc/ extension concurrently,
# emitting every wheel into /wheels.
RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/ \
    python3 -m pip install -U --no-cache-dir \
    packaging setuptools wheel pip && \
    export CC=$(realpath -e ./compiler) && \
    cd flash-attention && \
    parallel 'cd {} && python3 setup.py bdist_wheel --dist-dir /wheels' ::: \
    . \
    csrc/ft_attention \
    csrc/fused_dense_lib \
    csrc/fused_softmax \
    csrc/layer_norm \
    csrc/rotary \
    csrc/xentropy

WORKDIR /wheels


# Final image: start again from the bare BASE_IMAGE and add only runtime
# pieces — the prebuilt wheels plus libaio (presumably required at runtime
# by DeepSpeed's async-I/O op; the dev builds install libaio-dev too).
FROM ${BASE_IMAGE}

RUN apt-get -qq update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    apt-get clean

# Install wheels from the builder stages via bind mounts so the wheel files
# themselves never land in an image layer.
RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/wheels \
    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
RUN --mount=type=bind,from=flash-attn-builder,source=/wheels,target=/tmp/wheels \
    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
# Remove the mount-point directory left behind by the bind mounts.
RUN rm -r /tmp/wheels
76 changes: 76 additions & 0 deletions torch-extras/compiler_wrapper.f95
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
PROGRAM compiler_wrapper
    ! Wraps GCC invocations,
    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
    ! with -D__AVX256__, and -march=native with -march=skylake,
    ! for better reproducibility and compatibility.
    !
    ! Exit codes: GCC's own exit code (folded to 0..255), or 95 when an
    ! argument cannot be retrieved.
    IMPLICIT NONE
    INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
    CHARACTER(len=:), ALLOCATABLE :: arg, command
    ALLOCATE(CHARACTER(len=128) :: arg)
    command = "gcc"

    DO i = 1, COMMAND_ARGUMENT_COUNT()
        ! Retry loop: grow `arg` until the whole argument fits.
        ! GET_COMMAND_ARGUMENT status: 0 = success, -1 = buffer too short
        ! (full_length holds the required size), anything else = failure.
        DO
            CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated)
            IF (truncated == 0) THEN
                EXIT
            ELSE IF (truncated == -1) THEN
                DEALLOCATE(arg)
                ALLOCATE(CHARACTER(len=full_length) :: arg)
            ELSE
                ! Retrieval failed outright; abort with a distinctive code.
                CALL EXIT(95)
            END IF
        END DO
        ! Comparisons below are blank-padded on both sides, so exact
        ! flag matches work regardless of the buffer's allocated length.
        IF (arg == "-march=native") THEN
            command = command // " '-march=skylake'"
        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
            command = command // " '-D__AVX256__'"
        ELSE
            ! NOTE(review): shell_escaped trims with LEN_TRIM, so an argument
            ! with genuine trailing spaces would lose them — harmless for
            ! compiler flags, but worth knowing.
            command = command // shell_escaped(arg)
        END IF
    END DO
    ! SYSTEM is a GNU extension; on some platforms the status is a
    ! wait()-style value > 255, so fold it into a nonzero 8-bit code.
    CALL SYSTEM(command, exitcode)
    IF (exitcode > 255) THEN
        exitcode = MAX(IAND(exitcode, 255), 1)
    END IF
    CALL EXIT(exitcode)


CONTAINS
    FUNCTION shell_escaped(str) RESULT(out)
        ! Turns [str] into [ 'str'] and replaces all
        ! internal ['] characters with ['"'"']
        ! (the standard POSIX-shell single-quote escape).
        IMPLICIT NONE
        CHARACTER(len=*), INTENT(IN) :: str
        CHARACTER(len=:), ALLOCATABLE :: out
        INTEGER :: old_i, out_i, old_len, out_len

        old_len = LEN_TRIM(str)
        ! Figure out the new length to allocate by scanning `str`.
        ! This always needs to add at least [ '] at the beginning
        ! and ['] at the end, so the length increases by at least 3.
        ! Each internal ['] expands to the 5-character ['"'"'], i.e. +4.
        out_len = old_len + 3
        DO old_i = 1, old_len
            IF (str(old_i:old_i) == "'") THEN
                out_len = out_len + 4
            END IF
        END DO
        ALLOCATE(CHARACTER(len=out_len) :: out)

        ! Copy over the string, performing necessary escapes.
        out(1:2) = " '"
        out_i = 3
        DO old_i = 1, old_len
            IF (str(old_i:old_i) == "'") THEN
                ! Escape internal single-quotes
                ! (the literal below is ['"'"'] with doubled-quote escapes).
                out(out_i:out_i + 4) = '''"''"'''
                out_i = out_i + 5
            ELSE
                ! No escaping needed
                out(out_i:out_i) = str(old_i:old_i)
                out_i = out_i + 1
            END IF
        END DO
        ! The closing quote lands exactly at index out_len.
        out(out_i:out_i) = "'"
    END FUNCTION
END PROGRAM