Skip to content

Commit

Permalink
Merge pull request #2 from karpathy/master
Browse files Browse the repository at this point in the history
Keep up-to-date with original
  • Loading branch information
staar authored May 25, 2024
2 parents 64b6c2a + fe698b3 commit 3c29255
Show file tree
Hide file tree
Showing 46 changed files with 5,760 additions and 1,706 deletions.
102 changes: 95 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ jobs:
build-and-test-cpu:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [ubuntu-latest, macos-latest, windows-latest]

runs-on: ${{ matrix.os }}

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Install OpenMP
if: matrix.os != 'windows-latest'
run: |
if [ "${{ runner.os }}" == "Linux" ]; then
sudo apt-get update && sudo apt-get install -y libomp-dev
Expand All @@ -32,31 +33,118 @@ jobs:
run: pip install -r requirements.txt

- name: Run preprocessing
run: python prepro_tinyshakespeare.py
run: python dev/data/tinyshakespeare.py

- name: Train model
run: python train_gpt2.py --device=cpu

- name: Download Win32 Make.exe
if: matrix.os == 'windows-latest'
run: |
$wc = New-Object System.Net.WebClient
$url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip'
$output = './make-bin-win64.zip'
$wc.DownloadFile($url, $output)
- name: Unzip Win32 Makefile
if: matrix.os == 'windows-latest'
run: |
unzip make-bin-win64.zip
- name: Compile training and testing program
if: matrix.os != 'windows-latest'
run: make test_gpt2 train_gpt2

- name: Compile training and testing program for Windows
if: matrix.os == 'windows-latest'
shell: cmd
run: |
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat"
make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2
- name: Execute testing program (With OpenMP)
if: matrix.os != 'windows-latest'
run: OMP_NUM_THREADS=8 ./test_gpt2

- name: Execute Windows testing program (With OpenMP)
if: matrix.os == 'windows-latest'
shell: cmd
run: |
copy test_gpt2 test_gpt2.exe
test_gpt2.exe
- name: Compile training and testing program without OpenMP
if: matrix.os != 'windows-latest'
run: NO_OMP=1 make test_gpt2 train_gpt2

- name: Execute testing program (No OpenMP)
if: matrix.os != 'windows-latest'
run: ./test_gpt2

build-cuda-windows:
runs-on: windows-latest
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Download Win32 Make.exe
run: |
$wc = New-Object System.Net.WebClient
$url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip'
$output = './make-bin-win64.zip'
$wc.DownloadFile($url, $output)
- name: Unzip Win32 Makefile
run: |
unzip make-bin-win64.zip
- name: Install Cuda Toolkit 12.4 on Windows
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
# Default installation path for CUDA Toolkit is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4
- name: Add Path
run: |
echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Build Cuda targets
shell: cmd
working-directory: ${{ github.workspace }}
run: |
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat"
make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu
build-cuda-fp32:
runs-on: ubuntu-latest
container:
image: nvidia/cuda:12.4.1-devel-ubuntu22.04

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Build FP32 checkpoint
run: make train_gpt2fp32cu test_gpt2fp32cu
Expand All @@ -71,7 +159,7 @@ jobs:

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Build project
run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu
Expand All @@ -83,7 +171,7 @@ jobs:

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Build project
run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu
Expand All @@ -95,7 +183,7 @@ jobs:

steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Install OpenMP and OpenMPI
run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev
Expand Down
13 changes: 10 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
.vscode
.venv

# data files
data

# .bin files generated by Python
*.bin

# data directories
dev/data/__pycache__/
dev/data/fineweb10B/
dev/data/hellaswag/
dev/data/mmlu/
dev/data/tinyshakespeare/
dev/data/tinystories/

# binaries
test_gpt2
test_gpt2cu
Expand All @@ -22,8 +27,10 @@ dev/cuda/classifier_fused
dev/cuda/adamw
dev/cuda/matmul_backward_bias
dev/cuda/nccl_all_reduce
dev/cuda/global_norm
*.obj
*.exe
*.o

# log files
*.log
97 changes: 69 additions & 28 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,34 @@ NVCC_INCLUDES =
NVCC_LDLIBS =
NCLL_INCUDES =
NVCC_CUDNN =
# overridable flag for multi-GPU training. by default we won't build with cudnn
# because it bloats up the compile time from a few seconds to ~minute
# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute
USE_CUDNN ?= 0

# Function to check if a file exists in the PATH
ifneq ($(OS), Windows_NT)
define file_exists_in_path
$(which $(1) 2>/dev/null)
endef
else
define file_exists_in_path
$(shell where $(1) 2>nul)
endef
endif

ifneq ($(CI),true) # if not in CI, then use the GPU query
ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY=
ifneq ($(call file_exists_in_path, __nvcc_device_query),)
GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query)
GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY))
endif
endif
endif

# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above
ifneq ($(GPU_COMPUTE_CAPABILITY),)
NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)]
endif

# autodect a lot of various supports on current platform
$(info ---------------------------------------------)

Expand Down Expand Up @@ -67,27 +91,44 @@ else
endif

# Check and include cudnn if available
# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer
# You need cuDNN from: https://developer.nvidia.com/cudnn
# Follow the apt-get instructions
# And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main
# For this there is no installation, just download the repo to your home directory
# and then we include it below (see currently hard-coded path assumed in home directory)
# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line
# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include
# Refer to the README for cuDNN install instructions
ifeq ($(USE_CUDNN), 1)
ifeq ($(SHELL_UNAME), Linux)
# hard-coded path for now
CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include
ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists)
ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists)
$(info ✓ cuDNN found, will run with flash-attention)
CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include
else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists)
$(info ✓ cuDNN found, will run with flash-attention)
CUDNN_FRONTEND_PATH ?= cudnn-frontend/include
else
$(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths)
endif
NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH)
NVCC_LDFLAGS += -lcudnn
NVCC_FLAGS += -DENABLE_CUDNN
NVCC_CUDNN = cudnn_att.o
else
ifneq ($(OS), Windows_NT)
$(info → cuDNN is not supported on MAC OS right now)
else
$(info ✓ Windows cuDNN found, will run with flash-attention)
ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists)
CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location
else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists)
CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location
else
$(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths)
endif
CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4"
CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH)
NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64
NVCC_CUDNN = cudnn_att.obj
NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH)
NVCC_LDFLAGS += -lcudnn
NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn
NVCC_FLAGS += -DENABLE_CUDNN
NVCC_CUDNN = cudnn_att.o
else
$(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions)
endif
else
$(info → cuDNN is not supported right now outside of Linux)
endif
else
$(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable)
Expand Down Expand Up @@ -183,36 +224,36 @@ ifeq ($(NVCC),)
$(info ✗ nvcc not found, skipping GPU/CUDA builds)
else
$(info ✓ nvcc found, including GPU/CUDA support)
TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu
TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN)
endif

$(info ---------------------------------------------)

all: $(TARGETS)

train_gpt2: train_gpt2.c
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE)
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE)

test_gpt2: test_gpt2.c
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE)
$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE)

cudnn_att.o: cudnn_att.cu
$(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS)
$(NVCC_CUDNN): cudnn_att.cpp
$(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES)

train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

train_gpt2fp32cu: train_gpt2_fp32.cu
$(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
$(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

test_gpt2fp32cu: test_gpt2_fp32.cu
$(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
$(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN)
$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

clean:
$(REMOVE_FILES) $(TARGETS)
$(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN)
Loading

0 comments on commit 3c29255

Please sign in to comment.