Skip to content

feat: PyTorch Extras Container #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,25 @@ on:
tag-suffix:
required: false
type: string
# Outputs re-exported from the `build` job so callers of this reusable
# workflow can consume the build result.
outputs:
  outcome:
    description: "The outcome of the build"
    value: ${{ jobs.build.outputs.outcome }}
  tags:
    description: "The resulting image tags"
    value: ${{ jobs.build.outputs.tags }}
  version:
    description: "The resulting image version"
    # Fix: previously forwarded jobs.build.outputs.tags (copy-paste error),
    # so the `version` output duplicated `tags` instead of the job's
    # `version` output defined from steps.meta.outputs.version.
    value: ${{ jobs.build.outputs.version }}

jobs:
build:
name: Build Images
runs-on: [self-hosted, Linux]
outputs:
outcome: ${{ steps.docker-build.outcome }}
tags: ${{ steps.meta.outputs.tags }}
version: ${{ steps.meta.outputs.version }}
steps:
- uses: actions/checkout@v3
- name: Set up Docker Buildx
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/torch-base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ jobs:
base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04
torch-version: ${{ matrix.torch }}
torchvision-version: ${{ matrix.vision }}
torchaudio-version: ${{ matrix.audio }}
torchaudio-version: ${{ matrix.audio }}
build-extras: true
3 changes: 2 additions & 1 deletion .github/workflows/torch-nccl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@ jobs:
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
torch-version: ${{ matrix.torch }}
torchvision-version: ${{ matrix.vision }}
torchaudio-version: ${{ matrix.audio }}
torchaudio-version: ${{ matrix.audio }}
build-extras: true
22 changes: 20 additions & 2 deletions .github/workflows/torch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ on:
required: false
type: string
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
build-extras:
required: false
type: boolean
default: false

workflow_dispatch:
inputs:
Expand All @@ -48,11 +52,15 @@ on:
required: false
type: string
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
build-extras:
required: false
type: boolean
default: false

jobs:
build:
uses: ./.github/workflows/build.yml
with:
with:
image-name: torch
folder: torch
tag-suffix: ${{ inputs.tag }}
Expand All @@ -63,4 +71,14 @@ jobs:
BUILD_TORCH_VERSION=${{ inputs.torch-version }}
BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
# Follow-up job: builds the torch-extras image on top of the torch image
# produced by the `build` job above. Runs only when the caller passes
# build-extras: true.
build-extras:
  if: inputs.build-extras
  needs: build
  uses: ./.github/workflows/build.yml
  with:
    image-name: torch-extras
    folder: torch-extras
    tag-suffix: ${{ inputs.tag }}
    # NOTE(review): needs.build.outputs.tags may contain multiple
    # newline-separated tags (docker/metadata-action behavior) — confirm the
    # build produces exactly one tag here, or BASE_IMAGE will be malformed.
    build-args: |
      BASE_IMAGE=${{ needs.build.outputs.tags }}
134 changes: 134 additions & 0 deletions torch-extras/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# syntax=docker/dockerfile:1.2

# Global build args. BASE_IMAGE is the image to extend (expected to provide
# python3/pip and a CUDA apt setup — see builder-base); the version args pin
# the releases built below. Stages that use one must re-declare it bare.
ARG BASE_IMAGE
ARG DEEPSPEED_VERSION="0.9.4"
ARG FLASH_ATTN_VERSION="1.0.7"

# Stage: fetch the flash-attention sources with git in a minimal image, so
# the heavyweight build stages below need neither git nor network access.
FROM alpine/git:2.36.3 as flash-attn-downloader
WORKDIR /git
ARG FLASH_ATTN_VERSION
# Shallow clone of the tagged release (submodules included, also shallow);
# the .git directory is removed to shrink what gets bind-mounted later.
RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
    https://github.com/HazyResearch/flash-attention -b v${FLASH_ATTN_VERSION} && \
    rm -rf flash-attention/.git


# Dependencies requiring NVCC are built ahead of time in a separate stage
# so that the ~2 GiB dev library installations don't have to be included
# in the final image.
FROM ${BASE_IMAGE} as builder-base
# Derive the apt package suffix (e.g. "11-8") from $CUDA_VERSION, which is
# assumed to be exported by the CUDA base image — TODO(review): confirm.
RUN export \
      CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
      CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
    export \
      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
    # Dev-only CUDA components needed to compile the extensions; they stay
    # out of the final image thanks to the multi-stage build.
    apt-get -qq update && apt-get install -y --no-install-recommends \
      cuda-nvcc-${CUDA_PACKAGE_VERSION} \
      cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
      libcurand-dev-${CUDA_PACKAGE_VERSION} \
      libcublas-dev-${CUDA_PACKAGE_VERSION} \
      libcusparse-dev-${CUDA_PACKAGE_VERSION} \
      libcusolver-dev-${CUDA_PACKAGE_VERSION} \
      cuda-nvprof-${CUDA_PACKAGE_VERSION} \
      cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
      libaio-dev \
      ninja-build \
      parallel \
    # gcc-10/g++-10/lld do not need to be installed here, but they improve the build.
    # gfortran-10 is just for compiler_wrapper.f95.
      gcc-10 g++-10 gfortran-10 lld && \
    apt-get clean && \
    # Make the GCC 10 toolchain and the LLD linker the system defaults.
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
    update-alternatives --install \
      /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \
    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1

# /wheels collects built wheel files; /build is the scratch working dir.
RUN mkdir /wheels /build
WORKDIR /build

# DeepSpeed forces -march=native into the compiler options,
# making the result dependent on the processor architecture
# used on the builder machine.
# The compiler wrapper normalizes -march=native to -march=skylake
# along with a couple other transformations before invoking GCC.
COPY compiler_wrapper.f95 .
# Compile the wrapper, then drop its source; ./compiler is used as CC below.
RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95


# Stage: build the DeepSpeed wheel with its C++/CUDA ops precompiled.
FROM builder-base as deepspeed-builder
# DeepSpeed build flags
# See: https://www.deepspeed.ai/tutorials/advanced-install
# A blank ("") value means "leave unset" — see the unsetting loop below.
ARG DS_BUILD_OPS="1"
ARG DS_BUILD_CPU_ADAM=""
ARG DS_BUILD_FUSED_ADAM=""
ARG DS_BUILD_FUSED_LAMB=""
# sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
ARG DS_BUILD_SPARSE_ATTN="0"
ARG DS_BUILD_TRANSFORMER=""
ARG DS_BUILD_TRANSFORMER_INFERENCE=""
ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
ARG DS_BUILD_UTILS=""
ARG DS_BUILD_AIO=""

ARG DEEPSPEED_VERSION

# bash is required for the ${!VAR} indirect expansion in the loop below.
SHELL ["/bin/bash", "-c"]
RUN python3 -m pip install -U --no-cache-dir \
    setuptools wheel pip && \
    { \
    # DeepSpeed doesn't handle blank environment variables
    # in the same way as unset ones, so clear any blank ones.
    for VAR in \
    DS_BUILD_OPS \
    DS_BUILD_CPU_ADAM \
    DS_BUILD_FUSED_ADAM \
    DS_BUILD_FUSED_LAMB \
    DS_BUILD_SPARSE_ATTN \
    DS_BUILD_TRANSFORMER \
    DS_BUILD_TRANSFORMER_INFERENCE \
    DS_BUILD_STOCHASTIC_TRANSFORMER \
    DS_BUILD_UTILS \
    DS_BUILD_AIO; \
    do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
    } && \
    # Build the wheel with the -march-normalizing wrapper as CC;
    # --no-build-isolation so the preinstalled torch is visible to setup.py.
    CC=$(realpath -e ./compiler) \
    python3 -m pip wheel -w /wheels \
    --no-cache-dir --no-build-isolation --no-deps \
    deepspeed==${DEEPSPEED_VERSION} && \
    rm ./*
SHELL ["/bin/sh", "-c"]

WORKDIR /wheels


# Stage: build flash-attention and its CUDA extension sub-packages as wheels.
FROM builder-base as flash-attn-builder
ARG FLASH_ATTN_VERSION

# Sources come from the downloader stage via a bind mount (no COPY layer).
# `parallel` builds the main package plus each csrc/ extension concurrently,
# emitting every wheel into /wheels.
RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/ \
    python3 -m pip install -U --no-cache-dir \
    packaging setuptools wheel pip && \
    export CC=$(realpath -e ./compiler) && \
    cd flash-attention && \
    parallel 'cd {} && python3 setup.py bdist_wheel --dist-dir /wheels' ::: \
    . \
    csrc/ft_attention \
    csrc/fused_dense_lib \
    csrc/fused_softmax \
    csrc/layer_norm \
    csrc/rotary \
    csrc/xentropy

WORKDIR /wheels


# Final image: start again from the bare BASE_IMAGE and add only runtime
# pieces — the prebuilt wheels plus libaio (presumably required at runtime
# by DeepSpeed's async-I/O op; the dev builds install libaio-dev too).
FROM ${BASE_IMAGE}

RUN apt-get -qq update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    apt-get clean

# Install wheels from the builder stages via bind mounts so the wheel files
# themselves never land in an image layer.
RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/wheels \
    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
RUN --mount=type=bind,from=flash-attn-builder,source=/wheels,target=/tmp/wheels \
    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
# Remove the mount-point directory left behind by the bind mounts.
RUN rm -r /tmp/wheels
76 changes: 76 additions & 0 deletions torch-extras/compiler_wrapper.f95
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
PROGRAM compiler_wrapper
    ! Wraps GCC invocations,
    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
    ! with -D__AVX256__, and -march=native with -march=skylake,
    ! for better reproducibility and compatibility.
    !
    ! Exit codes: GCC's own exit code (folded to 0..255), or 95 when an
    ! argument cannot be retrieved.
    IMPLICIT NONE
    INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
    CHARACTER(len=:), ALLOCATABLE :: arg, command
    ALLOCATE(CHARACTER(len=128) :: arg)
    command = "gcc"

    DO i = 1, COMMAND_ARGUMENT_COUNT()
        ! Retry loop: grow `arg` until the whole argument fits.
        ! GET_COMMAND_ARGUMENT status: 0 = success, -1 = buffer too short
        ! (full_length holds the required size), anything else = failure.
        DO
            CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated)
            IF (truncated == 0) THEN
                EXIT
            ELSE IF (truncated == -1) THEN
                DEALLOCATE(arg)
                ALLOCATE(CHARACTER(len=full_length) :: arg)
            ELSE
                ! Retrieval failed outright; abort with a distinctive code.
                CALL EXIT(95)
            END IF
        END DO
        ! Comparisons below are blank-padded on both sides, so exact
        ! flag matches work regardless of the buffer's allocated length.
        IF (arg == "-march=native") THEN
            command = command // " '-march=skylake'"
        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
            command = command // " '-D__AVX256__'"
        ELSE
            ! NOTE(review): shell_escaped trims with LEN_TRIM, so an argument
            ! with genuine trailing spaces would lose them — harmless for
            ! compiler flags, but worth knowing.
            command = command // shell_escaped(arg)
        END IF
    END DO
    ! SYSTEM is a GNU extension; on some platforms the status is a
    ! wait()-style value > 255, so fold it into a nonzero 8-bit code.
    CALL SYSTEM(command, exitcode)
    IF (exitcode > 255) THEN
        exitcode = MAX(IAND(exitcode, 255), 1)
    END IF
    CALL EXIT(exitcode)


CONTAINS
    FUNCTION shell_escaped(str) RESULT(out)
        ! Turns [str] into [ 'str'] and replaces all
        ! internal ['] characters with ['"'"']
        ! (the standard POSIX-shell single-quote escape).
        IMPLICIT NONE
        CHARACTER(len=*), INTENT(IN) :: str
        CHARACTER(len=:), ALLOCATABLE :: out
        INTEGER :: old_i, out_i, old_len, out_len

        old_len = LEN_TRIM(str)
        ! Figure out the new length to allocate by scanning `str`.
        ! This always needs to add at least [ '] at the beginning
        ! and ['] at the end, so the length increases by at least 3.
        ! Each internal ['] expands to the 5-character ['"'"'], i.e. +4.
        out_len = old_len + 3
        DO old_i = 1, old_len
            IF (str(old_i:old_i) == "'") THEN
                out_len = out_len + 4
            END IF
        END DO
        ALLOCATE(CHARACTER(len=out_len) :: out)

        ! Copy over the string, performing necessary escapes.
        out(1:2) = " '"
        out_i = 3
        DO old_i = 1, old_len
            IF (str(old_i:old_i) == "'") THEN
                ! Escape internal single-quotes
                ! (the literal below is ['"'"'] with doubled-quote escapes).
                out(out_i:out_i + 4) = '''"''"'''
                out_i = out_i + 5
            ELSE
                ! No escaping needed
                out(out_i:out_i) = str(old_i:old_i)
                out_i = out_i + 1
            END IF
        END DO
        ! The closing quote lands exactly at index out_len.
        out(out_i:out_i) = "'"
    END FUNCTION
END PROGRAM