diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52715eb9c..e4c211bc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,15 +12,16 @@ jobs: build-and-test-cpu: strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -32,23 +33,110 @@ jobs: run: pip install -r requirements.txt - name: Run preprocessing - run: python prepro_tinyshakespeare.py + run: python dev/data/tinyshakespeare.py - name: Train model run: python train_gpt2.py --device=cpu + - name: Download Win32 Make.exe + if: matrix.os == 'windows-latest' + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + if: matrix.os == 'windows-latest' + run: | + unzip make-bin-win64.zip + - name: Compile training and testing program + if: matrix.os != 'windows-latest' run: make test_gpt2 train_gpt2 + - name: Compile training and testing program for Windows + if: matrix.os == 'windows-latest' + shell: cmd + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 + - name: Execute testing program (With OpenMP) + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' + shell: cmd + run: | + copy test_gpt2 test_gpt2.exe + test_gpt2.exe + - name: Compile training and testing program without OpenMP + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) + if: matrix.os != 'windows-latest' run: ./test_gpt2 + build-cuda-windows: + runs-on: windows-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download Win32 Make.exe + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + run: | + unzip make-bin-win64.zip + + - name: Install Cuda Toolkit 12.4 on Windows + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O 
"https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + + # Default installation path for CUDA Toolkit is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 + - name: Add Path + run: | + echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Build Cuda targets + shell: cmd + working-directory: ${{ github.workspace }} + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu + build-cuda-fp32: runs-on: ubuntu-latest container: @@ -56,7 +144,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build FP32 checkpoint run: 
make train_gpt2fp32cu test_gpt2fp32cu @@ -71,7 +159,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -83,7 +171,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -95,7 +183,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP and OpenMPI run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev diff --git a/.gitignore b/.gitignore index 5e88e4285..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,12 +2,17 @@ .vscode .venv -# data files -data - # .bin files generated by Python *.bin +# data directories +dev/data/__pycache__/ +dev/data/fineweb10B/ +dev/data/hellaswag/ +dev/data/mmlu/ +dev/data/tinyshakespeare/ +dev/data/tinystories/ + # binaries test_gpt2 test_gpt2cu @@ -22,8 +27,10 @@ dev/cuda/classifier_fused dev/cuda/adamw dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce +dev/cuda/global_norm *.obj *.exe +*.o # log files *.log diff --git a/Makefile b/Makefile index 04cbfbb2a..c8b555ac2 100644 --- a/Makefile +++ b/Makefile @@ -19,10 +19,34 @@ NVCC_INCLUDES = NVCC_LDLIBS = NCLL_INCUDES = NVCC_CUDNN = -# overridable flag for multi-GPU training. by default we won't build with cudnn -# because it bloats up the compile time from a few seconds to ~minute +# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# Function to check if a file exists in the PATH +ifneq ($(OS), Windows_NT) +define file_exists_in_path + $(which $(1) 2>/dev/null) +endef +else +define file_exists_in_path + $(shell where $(1) 2>nul) +endef +endif + +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif + endif +endif + +# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above +ifneq ($(GPU_COMPUTE_CAPABILITY),) + NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + # autodect a lot of various supports on current platform $(info ---------------------------------------------) @@ -67,27 +91,44 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer -# You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions -# And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory -# and then we include it below (see currently hard-coded path assumed in home directory) +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line +# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include +# Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include - ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && 
echo "exists"), exists) + ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= cudnn-frontend/include + else + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + endif + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + NVCC_CUDNN = cudnn_att.o + else + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location + else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location + else + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + endif + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) - NVCC_LDFLAGS += -lcudnn + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn NVCC_FLAGS += -DENABLE_CUDNN - NVCC_CUDNN = cudnn_att.o - else - $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) endif - else - $(info → cuDNN is not supported right now outside of Linux) endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -183,7 +224,7 @@ ifeq ($(NVCC),) $(info ✗ nvcc not found, skipping GPU/CUDA builds) else $(info ✓ nvcc found, including GPU/CUDA support) - TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu + TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN) endif $(info ---------------------------------------------) @@ -191,28 +232,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cpp + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: - $(REMOVE_FILES) $(TARGETS) + $(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN) diff --git a/README.md b/README.md index dbb99e030..7b9c2d4fe 100644 --- a/README.md +++ b/README.md @@ -12,27 +12,27 @@ The "I don't care about anything I just want to train and I have a GPU" section. ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2fp32cu ./train_gpt2fp32cu ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. 
Below in the CUDA section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which is still being very actively developed, uses mixed precision, and runs ~2X faster. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. In the next section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which uses mixed precision, and runs ~2X faster. ## quick start (GPU, fast bleeding edge) -I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu` and also turn on flash attention. Run: +I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu`. Run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see `Makefile` for instructions), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can go even faster with flash attention. Adjust the make command as follows to compile with cudnn / flash attention: ```bash make train_gpt2cu USE_CUDNN=1 @@ -45,75 +45,102 @@ Note that the default batch size is very low (4). If you have enough memory on y ./train_gpt2cu -b 32 ``` -My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: +My standard single-GPU "prod" run (e.g. with a A100 40GB) trains on TinyStories instead of TinyShakespeare and looks like this, as an example: ```bash -python prepro_tinystories.py +python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i data/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories_train.bin \ + -j dev/data/tinystories/TinyStories_val.bin \ + -v 250 -s 250 -g 144 -o stories.log -b 32 ``` -Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. +The `-i` flag is a glob pattern for the input data, `-j` for the val data. In addition I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. -## quick start (CPU) - -The "I am so GPU poor that I don't even have one" section. No worries, run: +If you want to train on actual, real pretraining data, check out the recently added support for [fineweb dataset](https://huggingface.co/datasets/HuggingFaceFW/fineweb). Unlike the datasets above where the train/val tokens fit into a single .bin file, we now have multiple data shards as well. 
Here is an example: -```bash -pip install -r requirements.txt -python prepro_tinyshakespeare.py -python train_gpt2.py -make train_gpt2 -OMP_NUM_THREADS=8 ./train_gpt2 +``` +# write fineweb data in 100M token shards to dev/data/fineweb10B +python dev/data/fineweb.py -s 100000000 +# compile and run +./train_gpt2cu -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -v 250 -s 250 -g 144 -o fineweb.log -b 32 ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. +Where you will notice the use of glob pattern `*` to match all the train shards. ## quick start (multiple GPUs) -You'll be using the (more bleeding edge) mixed precision version of the code: +Great, let's get even more serious. We're using MPI and NCCL for multi-GPU training. Everything in the section above applies, with the following changes: -``` +```bash +# example to install MPI: sudo apt install openmpi-bin openmpi-doc libopenmpi-dev +# the run command is now preceeded by `mpirun`: +mpirun -np ./train_gpt2cu +``` + +Sub in the number of GPUs you'd like to run on in the last command. All of the flags discussed in the section above apply here as well. + +## quick start (CPU) + +The "I am so GPU poor that I don't even have one" section. You can still train! But you won't go too far. You can still finetune a GPT-2 small (124M parameter model) to output Shakespeare-like text, as an example: + +```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py -make train_gpt2cu -mpirun -np ./train_gpt2cu +make train_gpt2 +OMP_NUM_THREADS=8 ./train_gpt2 ``` -Sub in the number of GPUs you'd like to run on in the last command. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. ## training: more detail -Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: +The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to file. So for example when you run: ```bash -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py ``` -This prints: +We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. 
The output of this looks like this: ``` -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin +writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `prepro_tinystories.py`. +The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. -In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: +In principle, once we have the tokens, we'd be ready to train the model right here. However, current code can't start training from scratch just yet (coming very soon), so we initialize training from the pretrained models released by OpenAI and do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C. This is what happens when you run this script: ```bash python train_gpt2.py ``` -You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. We can now initialize with these model weights and continue training in raw C. First compile the code: +You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. The file also saves both the fp32 versions of the above, and the bfloat16 versions of them for mixed precision training. We can now initialize with these model weights and continue training in raw C. Then we compile the training programs with `make`. 
There are currently three parallel implementations: ```bash +# the simple, CPU, reference code version make train_gpt2 +# the single-GPU fp32 CUDA version +make train_gpt2fp32cu +# the multi-GPU mixed precision CUDA version +make train_gpt2cu ``` -You can have a look inside the `Makefile` and its comments. It will try to autodetect if OpenMP is available on your system, which is very helpful for speeding up the code at very low cost of code complexity. Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: +You can have a look inside the `Makefile` and its comments. It will try to autodetect a lot of tools and libraries (e.g. cuDNN, OpenMP, OpenMPI, nvcc), and you want to get as many checkmarks as possible. For example when I run `make train_gpt2cu USE_CUDNN=1` on my fully configured machine, we see: + +``` +✓ cuDNN found, will run with flash-attention +✓ OpenMP found +✓ OpenMPI found, OK to train with multiple GPUs +✓ nvcc found, including GPU/CUDA support +``` + +Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: ``` # try this first @@ -122,7 +149,7 @@ CFLAGS="-Ofast -fno-finite-math-only -Wno-unused-result -march=native" make trai CFLAGS="-O3 -Wno-unused-result -march=native" make train_gpt2 ``` -Once `train_gpt2` is compiled, you can run it: +Once the binary is compiled, we can run it. For example the simplest CPU reference version runs as: ```bash OMP_NUM_THREADS=8 ./train_gpt2 @@ -164,18 +191,27 @@ Allay --- ``` -I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. Also note that if you are seeing token ids instead of text in the generation, it might be because your code is out of date, as Tokenizer decoding was added April 14, 2024. `git pull` the updates, and then re-run `python train_gpt2.py`, which will now also save the tokenizer, which C can read and then use to print text instead of token ids. +I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. + +Finally, the code is in flux. If anything weird happens that you didn't expect or that worked previously, try to `git pull`, re-run all the commands above, reference back to this README, etc. ## test -I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with: +I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. 
On the CPU as an example, compile and run with: ```bash make test_gpt2 ./test_gpt2 ``` -This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. +This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. To test the GPU version I run: + +```bash +# fp32 test (cudnn not supported) +make test_gpt2cu PRECISION=FP32 && ./test_gpt2cu +# mixed precision cudnn test +make test_gpt2cu USE_CUDNN=1 && ./test_gpt2cu +``` ## tutorial @@ -183,7 +219,7 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layer ## CUDA -The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we roughly match the speed of PyTorch. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. +The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we slightly exceed the speed of PyTorch Nightly. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. **WIP alert, April 23**. We merged the first version of mixed precision training code. I checkpointed the fp32 version to separate files that include `_fp32` in their filename, and would like to preserve this version in the root of the repo because it 1) doesn't require the most up to date CUDA and will a lot more likely compile and is more portable, 2) it is a lot simpler and acts as reference. In fact, we'd like to diverge the fp32 version in the direction of being pure CUDA (e.g. do not even call cuBLAS by default), to be used as an educational reference, maybe even a kernel of a course on CUDA. The "mainline" development concerned with speed will from there on move to the [train_gpt2.cu](train_gpt2.cu) file, which includes mixed precision training. @@ -198,7 +234,7 @@ make test_gpt2fp32cu This prints `overall okay: 1`. So the forward activations, backward gradients, and the individual loss values for 10 iterations all match exactly. -**Training**. To train GPT-2 in a single file of CUDA, run the train script: +**Training**. To train on single GPU in fp32: ```bash make train_gpt2fp32cu @@ -228,9 +264,7 @@ For on his rock shall he be opencast. Keep on with me, my ``` -This runs on my A100 in about ~10 seconds. This training loop in the PyTorch script is about 80ms/iteration, so we are slightly better than PyTorch here. However, this is measured with PyTorch that is a bit stale (I'm on 2.1.0) and we're not yet including FlashAttention or the PyTorch scaled_dot_product_attention fused operation. - -We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: +This runs on my A100 in about ~10 seconds. 
We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: ```bash python train_gpt2.py --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1 @@ -256,7 +290,16 @@ If you have the latest CUDA you should expect this to compile OK, and you should make train_gpt2cu USE_CUDNN=1 ``` -This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. Follow the [cuDNN installation instructions](https://developer.nvidia.com/cudnn) to install cuDNN with apt-get. On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). +This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. The [cuDNN installation instructions](https://developer.nvidia.com/cudnn) with apt-get will grab the default set of cuDNN packages. For a minimal setup, the cuDNN dev package is sufficient, e.g. on Ubuntu 22.04 for CUDA 12.x: + +```bash +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install libcudnn9-dev-cuda-12 +``` + +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line. **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: @@ -333,8 +376,12 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## notable forks +- AMD support + - [llm.c](https://github.com/anthonix/llm.c) by @[anthonix](https://github.com/anthonix): support for AMD devices, such as the 7900 XTX + - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project + - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. 
Clone and run ✅ - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) @@ -353,12 +400,20 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project - Rust + - [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have same performance - [llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project +- Swift + - [llm.swift](https://github.com/otabuzzman/llm.swift) by @[otabuzzman](https://github.com/otabuzzman): a Swift port of this project + - Zig - [llm.zig](https://github.com/Saimirbaci/llm.zig) by @[saimirbaci](https://github.com/Saimirbaci): a Zig port of this project +## major changes log + +- **May 21, 2024: Dataset refactor**. I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. + ## discussions Ways of organizing development: diff --git a/cudnn_att.cu b/cudnn_att.cpp similarity index 58% rename from cudnn_att.cu rename to cudnn_att.cpp index 2735bbd14..04b1a92ec 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cpp @@ -5,18 +5,22 @@ #include #include #include +namespace fe = cudnn_frontend; // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; +static_assert(false, "cuDNN is not supported in FP32 mode.") // use fp16 (note: this may require gradient scaler, currently not implemented!) #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F +#define CUDNN_16BIT fe::DataType_t::HALF #else // Default to bfloat16 typedef __nv_bfloat16 floatX; +#define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif // CUDA error checking @@ -34,24 +38,15 @@ namespace { class NvtxRange { public: NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { std::string range_string = base_str + " " + std::to_string(number); nvtxRangePush(range_string.c_str()); } - ~NvtxRange() { nvtxRangePop(); } }; } #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) -namespace fe = cudnn_frontend; -#if CUBLAS_LOWP == CUDA_R_16BF -#define CUDNN_16BIT fe::DataType_t::BFLOAT16 -#else -#define CUDNN_16BIT fe::DataType_t::HALF -#endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) 
static void* cudnn_workspace = NULL; @@ -65,62 +60,59 @@ static void checkCudnnFE(fe::error_object e, const char *file, int line) { } #define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__) -using graph_tensors_fwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // O - std::shared_ptr // Stats ->; - -using graph_tensors_bwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // O - std::shared_ptr, // dO - std::shared_ptr, // Stats - std::shared_ptr, // Attn_scale, - std::shared_ptr, // dQ, - std::shared_ptr, // dK, - std::shared_ptr // dV ->; +enum UIDs { + Q_UID, + K_UID, + V_UID, + Attn_scale_UID, + O_UID, + Stats_UID, + dO_UID, + dQ_UID, + dK_UID, + dV_UID +}; // Need a cache because graph->build_operation_graph() is slow but everything else seems fast -using cache_type_fwd = std::unordered_map; -using cache_type_bwd = std::unordered_map; +using cache_type_fwd = std::map, std::shared_ptr>; +using cache_type_bwd = std::map, std::shared_ptr>; // Loosely based on cuDNN frontend samples functions and massively simplified -template -auto lookup_cache_or_build_graph_fwd(Args... args) { +auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) { + static cache_type_fwd user_maintained_cache_fwd; - auto [B, H, T, HS, is_inference_only] = std::make_tuple(args...); + auto key = std::make_tuple(B, H, T, HS, is_inference_only); + + auto it = user_maintained_cache_fwd.find(key); + if (it != user_maintained_cache_fwd.end()) { + return it->second; + } + auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) + .set_uid(Q_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) + .set_uid(K_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) + .set_uid(V_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_uid(Attn_scale_UID) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); @@ -131,95 +123,99 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32 - O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}); + O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID); assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) - .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_dim({B, H, T, 1}) + .set_stride({H * T, T, 1, 1}) + .set_uid(Stats_UID); } checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_fwd.find(key); - if (it != user_maintained_cache_fwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + // Reallocate the workspace if the required size is greater than the current workspace + // In H100 this may be around 16B + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } - auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); - user_maintained_cache_fwd.insert({key, tuple}); - return tuple; + user_maintained_cache_fwd.insert({key, graph}); + + return graph; } -template -auto lookup_cache_or_build_graph_bwd(Args... args) { +auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { static cache_type_bwd user_maintained_cache_bwd; - auto [B, NH, T, HS] = std::make_tuple(args...); + + auto key = std::make_tuple(B, NH, T, HS); + + auto it = user_maintained_cache_bwd.find(key); + if (it != user_maintained_cache_bwd.end()) { + return it->second; + } auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") - .set_dim({B, NH, T, 1}) - .set_stride({NH * T, T, 1, 1}) - .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) 
- .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") - .set_causal_mask(true) - .set_attn_scale(attn_scale); + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, NH, T, HS}) + .set_uid(Q_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, NH, T, HS}) + .set_uid(K_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, NH, T, HS}) + .set_uid(V_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") + .set_dim({B, NH, T, HS}) + .set_uid(O_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") + .set_dim({B, NH, T, HS}) + .set_uid(dO_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") + .set_dim({B, NH, T, 1}) + .set_uid(Stats_UID) + .set_stride({NH * T, T, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_uid(Attn_scale_UID) + .set_data_type(fe::DataType_t::FLOAT)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") + .set_causal_mask(true) + .set_attn_scale(attn_scale); // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); - dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); + dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID); + dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID); + dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID); checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_bwd.find(key); - if (it != user_maintained_cache_bwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); @@ -227,9 +223,18 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); - auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); - user_maintained_cache_bwd.insert({key, tuple}); - return tuple; + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + + user_maintained_cache_bwd.insert({key, graph}); + return graph; } void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) @@ -241,8 +246,7 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) bool is_inference_only = (stats == nullptr); // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, attn_scale, O, softmax_stats] = - lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); + auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); // Prepare all the tensor pointers for executing the graph void* devPtrQ = inp; @@ -252,22 +256,12 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) void* devPtrO = out; // Build variant pack - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {attn_scale, &attn_scale_cpu}, {O, devPtrO}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}}; // Add the stats tensor unless we are only doing inference (only needed for backward pass) if (is_inference_only == false) { - variant_pack[softmax_stats] = stats; - } - - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + variant_pack[Stats_UID] = stats; } // Execute graph @@ -282,8 +276,7 @@ void attention_backward_cudnn(floatX* dqkvr, int HS = C / NH; // number of features per head // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, O, dO, Stats, attn_scale, dQ, dK, dV] = - lookup_cache_or_build_graph_bwd(B, NH, T, HS); + auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS); // Prepare all the tensor pointers for executing the graph void* devPtrQ = qkvr; @@ -299,20 +292,10 @@ void attention_backward_cudnn(floatX* dqkvr, void* devPtrdV = (dqkvr + 2 * NH * HS); // Build variant pack that links each tensor to its data pointer - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {O, devPtrO}, {dO, devPtrdO}, {Stats, devPtrStats}, - {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, - {attn_scale, &attn_scale_cpu}}; - - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - 
cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats}, + {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV}, + {Attn_scale_UID, &attn_scale_cpu}}; // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); diff --git a/dataloader.h b/dataloader.h new file mode 100644 index 000000000..6b63c34a1 --- /dev/null +++ b/dataloader.h @@ -0,0 +1,444 @@ +/* +Implements a medium simple DataLoader for a distributed training setup. +*/ +#ifndef DATALOADER_H +#define DATALOADER_H + +#include +#include +#include +#include +#include +#include +// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck +// defines: mallocCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// implementation of glob for Windows is in dev/unistd.h +#ifndef _WIN32 +#include +#endif +// ---------------------------------------------------------------------------- +// Distributed Data Loader +#define HEADER_SIZE 256 + +typedef struct { + // variables related to distributed training + // each process/worker has to access different parts of the data + int process_rank; + int num_processes; + // hyperparameters. use size_t to prevent overflow + size_t B; + size_t T; + // input handling and its state + glob_t glob_result; // stores the result of glob, for all shards we want to iterate + int current_shard; // the current shard we are reading from + FILE* tokens_file; + int64_t file_size; + int64_t current_position; + uint16_t* buffer; // we fread data from file into this buffer + // public variables that could be accessed from outside + size_t num_batches; + int* inputs; // input tokens into transformer + int* targets; // target tokens for the transformer +} DataLoader; + +int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) { + // use the first glob match as the filename for now + const char* filename = loader->glob_result.gl_pathv[shard_index]; + // open the input file for reading. also only a single file can be opened at a time + if (loader->tokens_file != NULL) { + fcloseCheck(loader->tokens_file); + } + loader->tokens_file = fopenCheck(filename, "rb"); + // validate the header + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); + if (header[0] != 20240520) { + printf("Bad magic in the data file\n"); + printf("---> HINT: Are you passing in a correct file?\n"); + printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n"); + exit(EXIT_FAILURE); + } + if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } + int64_t ntok = header[2]; // number of tokens in the file + assert(ntok > 0); // we expect some tokens in the file. this should never trip, right? + // determine the file size and make sure it is consistent with the number of tokens + fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file + loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. 
file size + fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning + // we expect ntok in the file to be consistent with filesize, assert that is the case + int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); + if (loader->file_size != expected_file_size) { + printf("Error: file size is not as expected\n"); + exit(EXIT_FAILURE); + } + return ntok; +} + +void dataloader_reset(DataLoader *loader) { + // fully resets the DataLoader object to init configuration + // each process starts at a different offset in the file + int64_t header_bytes = HEADER_SIZE * sizeof(int); + int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_shard = 0; + loader->current_position = header_bytes + token_bytes_offset; + dataloader_load_shard_(loader, loader->current_shard); +} + +void dataloader_advance_(DataLoader *loader) { + // advance the loader by loading the next data shard and resetting the position + if (loader->glob_result.gl_pathc > 1) { + // if we have more than one shard, advance to the next one + loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc; + dataloader_load_shard_(loader, loader->current_shard); + } + int64_t header_bytes = HEADER_SIZE * sizeof(int); + int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + +void dataloader_init(DataLoader *loader, + const char* filename_pattern, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + loader->tokens_file = NULL; + + // glob to get the list of files matching the pattern, these are our data shards + int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); + if (glob_status != 0) { + printf("Error: failed to glob pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + if (loader->glob_result.gl_pathc == 0) { + printf("Error: no files found matching the pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + + // inspect and validate all shards so we don't get any runtime errors later + // if too slow / too many shards, may wish to revisit later + int64_t ntok_total = 0; + for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) { + int64_t shard_ntok = dataloader_load_shard_(loader, shard_index); + // we need at least one batch/shard, the way things are written right now. + // can be relaxed a lot later. 
+ assert(shard_ntok >= num_processes * B * T + 1); + ntok_total += shard_ntok; + } + // debugging prints + // printf("DataLoader: filename_pattern: %s\n", filename_pattern); + // printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + + // allocate all the space we'll need + loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int*)malloc(B * T * sizeof(int)); + loader->targets = (int*)malloc(B * T * sizeof(int)); + loader->num_batches = ntok_total / (num_processes * B * T); // useful to know + + // reset the loader, to initialize it + dataloader_reset(loader); +} + +void dataloader_next_batch(DataLoader *loader) { + size_t B = loader->B; + size_t T = loader->T; + // read B*T+1 uint16_t tokens from the file into buffer + fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); + freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); + // decode the buffer into inputs and targets (cast to int) + for (int i = 0; i < B*T; i++) { + loader->inputs[i] = (int)loader->buffer[i]; + loader->targets[i] = (int)loader->buffer[i+1]; + } + // advance the current position by B*T*num_processes integers + // note: the "stride" of tokens by which we move each time is definitely B * T + // we only load B * T + 1 tokens at each iteration because the targets are offset by 1 + loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); + // if the next batch would go past the end of the file, advance the loader + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_advance_(loader); + } +} + +void dataloader_free(DataLoader *loader) { + free(loader->buffer); + free(loader->inputs); + free(loader->targets); + fcloseCheck(loader->tokens_file); + globfree(&loader->glob_result); +} + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like) HellaSwag and MMLU are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve these style of evals +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - delimiter of 2**16-1, i.e. 65,535 + - , bytes encoding this example, allowing efficient skip to next + - , the index of the example in the dataset + -
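For reference, the token shard layout that dataloader.h validates above is: a 1024-byte header of 256 int32s (magic 20240520 at header[0], format version 1 at header[1], token count at header[2]), followed by the tokens as a flat uint16 stream. The sketch below is a minimal standalone checker of that layout, not part of the patch itself; it assumes a shard produced by one of the dev/data/*.py scripts (the default path is only an example) and simplifies error handling compared to the fopenCheck/freadCheck helpers the real loader uses.

```c
// Hypothetical standalone sketch: validate the header of a token .bin shard,
// mirroring the checks done in dataloader_load_shard_ above.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HEADER_SIZE 256  // header is 256 int32s = 1024 bytes

int main(int argc, char **argv) {
    // example path; pass any shard written by dev/data/*.py as argv[1]
    const char *path = argc > 1 ? argv[1] : "dev/data/tinyshakespeare/tiny_shakespeare_val.bin";
    FILE *f = fopen(path, "rb");
    if (f == NULL) { fprintf(stderr, "could not open %s\n", path); return 1; }
    int header[HEADER_SIZE];
    if (fread(header, sizeof(int), HEADER_SIZE, f) != HEADER_SIZE) { fclose(f); return 1; }
    if (header[0] != 20240520) { fprintf(stderr, "bad magic\n"); fclose(f); return 1; }  // magic number
    if (header[1] != 1) { fprintf(stderr, "bad version\n"); fclose(f); return 1; }       // format version
    int64_t ntok = header[2];  // number of uint16 tokens that follow the header
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    int64_t expected = (int64_t)HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t);
    printf("%s: %lld tokens, file size %ld bytes, expected %lld bytes\n",
           path, (long long)ntok, file_size, (long long)expected);
    fclose(f);
    return file_size == expected ? 0 : 1;
}
```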