diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52715eb9c..e4c211bc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,15 +12,16 @@ jobs: build-and-test-cpu: strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -32,23 +33,110 @@ jobs: run: pip install -r requirements.txt - name: Run preprocessing - run: python prepro_tinyshakespeare.py + run: python dev/data/tinyshakespeare.py - name: Train model run: python train_gpt2.py --device=cpu + - name: Download Win32 Make.exe + if: matrix.os == 'windows-latest' + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + if: matrix.os == 'windows-latest' + run: | + unzip make-bin-win64.zip + - name: Compile training and testing program + if: matrix.os != 'windows-latest' run: make test_gpt2 train_gpt2 + - name: Compile training and testing program for Windows + if: matrix.os == 'windows-latest' + shell: cmd + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 + - name: Execute testing program (With OpenMP) + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' + shell: cmd + run: | + copy test_gpt2 test_gpt2.exe + test_gpt2.exe + - name: Compile training and testing program without OpenMP + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) + if: matrix.os != 'windows-latest' run: ./test_gpt2 + build-cuda-windows: + runs-on: windows-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download Win32 Make.exe + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + run: | + unzip make-bin-win64.zip + + - name: Install Cuda Toolkit 12.4 on Windows + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O 
"https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + + # Default installation path for CUDA Toolkit is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 + - name: Add Path + run: | + echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Build Cuda targets + shell: cmd + working-directory: ${{ github.workspace }} + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu + build-cuda-fp32: runs-on: ubuntu-latest container: @@ -56,7 +144,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build FP32 checkpoint run: 
make train_gpt2fp32cu test_gpt2fp32cu @@ -71,7 +159,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -83,7 +171,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -95,7 +183,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP and OpenMPI run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev diff --git a/.gitignore b/.gitignore index 5e88e4285..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,12 +2,17 @@ .vscode .venv -# data files -data - # .bin files generated by Python *.bin +# data directories +dev/data/__pycache__/ +dev/data/fineweb10B/ +dev/data/hellaswag/ +dev/data/mmlu/ +dev/data/tinyshakespeare/ +dev/data/tinystories/ + # binaries test_gpt2 test_gpt2cu @@ -22,8 +27,10 @@ dev/cuda/classifier_fused dev/cuda/adamw dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce +dev/cuda/global_norm *.obj *.exe +*.o # log files *.log diff --git a/Makefile b/Makefile index 04cbfbb2a..c8b555ac2 100644 --- a/Makefile +++ b/Makefile @@ -19,10 +19,34 @@ NVCC_INCLUDES = NVCC_LDLIBS = NCLL_INCUDES = NVCC_CUDNN = -# overridable flag for multi-GPU training. by default we won't build with cudnn -# because it bloats up the compile time from a few seconds to ~minute +# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# Function to check if a file exists in the PATH +ifneq ($(OS), Windows_NT) +define file_exists_in_path + $(which $(1) 2>/dev/null) +endef +else +define file_exists_in_path + $(shell where $(1) 2>nul) +endef +endif + +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif + endif +endif + +# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above +ifneq ($(GPU_COMPUTE_CAPABILITY),) + NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + # autodect a lot of various supports on current platform $(info ---------------------------------------------) @@ -67,27 +91,44 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer -# You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions -# And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory -# and then we include it below (see currently hard-coded path assumed in home directory) +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line +# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include +# Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include - ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && 
echo "exists"), exists) + ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= cudnn-frontend/include + else + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + endif + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + NVCC_CUDNN = cudnn_att.o + else + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location + else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location + else + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + endif + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) - NVCC_LDFLAGS += -lcudnn + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn NVCC_FLAGS += -DENABLE_CUDNN - NVCC_CUDNN = cudnn_att.o - else - $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) endif - else - $(info → cuDNN is not supported right now outside of Linux) endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -183,7 +224,7 @@ ifeq ($(NVCC),) $(info ✗ nvcc not found, skipping GPU/CUDA builds) else $(info ✓ nvcc found, including GPU/CUDA support) - TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu + TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN) endif $(info ---------------------------------------------) @@ -191,28 +232,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cpp + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: - $(REMOVE_FILES) $(TARGETS) + $(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN) diff --git a/README.md b/README.md index dbb99e030..7b9c2d4fe 100644 --- a/README.md +++ b/README.md @@ -12,27 +12,27 @@ The "I don't care about anything I just want to train and I have a GPU" section. ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2fp32cu ./train_gpt2fp32cu ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. 
Below in the CUDA section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which is still being very actively developed, uses mixed precision, and runs ~2X faster. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. In the next section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which uses mixed precision, and runs ~2X faster. ## quick start (GPU, fast bleeding edge) -I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu` and also turn on flash attention. Run: +I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu`. Run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see `Makefile` for instructions), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can go even faster with flash attention. Adjust the make command as follows to compile with cudnn / flash attention: ```bash make train_gpt2cu USE_CUDNN=1 @@ -45,75 +45,102 @@ Note that the default batch size is very low (4). If you have enough memory on y ./train_gpt2cu -b 32 ``` -My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: +My standard single-GPU "prod" run (e.g. with a A100 40GB) trains on TinyStories instead of TinyShakespeare and looks like this, as an example: ```bash -python prepro_tinystories.py +python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i data/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories_train.bin \ + -j dev/data/tinystories/TinyStories_val.bin \ + -v 250 -s 250 -g 144 -o stories.log -b 32 ``` -Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. +The `-i` flag is a glob pattern for the input data, `-j` for the val data. In addition I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. -## quick start (CPU) - -The "I am so GPU poor that I don't even have one" section. No worries, run: +If you want to train on actual, real pretraining data, check out the recently added support for [fineweb dataset](https://huggingface.co/datasets/HuggingFaceFW/fineweb). Unlike the datasets above where the train/val tokens fit into a single .bin file, we now have multiple data shards as well. 
Here is an example: -```bash -pip install -r requirements.txt -python prepro_tinyshakespeare.py -python train_gpt2.py -make train_gpt2 -OMP_NUM_THREADS=8 ./train_gpt2 +``` +# write fineweb data in 100M token shards to dev/data/fineweb10B +python dev/data/fineweb.py -s 100000000 +# compile and run +./train_gpt2cu -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -v 250 -s 250 -g 144 -o fineweb.log -b 32 ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. +Where you will notice the use of glob pattern `*` to match all the train shards. ## quick start (multiple GPUs) -You'll be using the (more bleeding edge) mixed precision version of the code: +Great, let's get even more serious. We're using MPI and NCCL for multi-GPU training. Everything in the section above applies, with the following changes: -``` +```bash +# example to install MPI: sudo apt install openmpi-bin openmpi-doc libopenmpi-dev +# the run command is now preceeded by `mpirun`: +mpirun -np ./train_gpt2cu +``` + +Sub in the number of GPUs you'd like to run on in the last command. All of the flags discussed in the section above apply here as well. + +## quick start (CPU) + +The "I am so GPU poor that I don't even have one" section. You can still train! But you won't go too far. You can still finetune a GPT-2 small (124M parameter model) to output Shakespeare-like text, as an example: + +```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py -make train_gpt2cu -mpirun -np ./train_gpt2cu +make train_gpt2 +OMP_NUM_THREADS=8 ./train_gpt2 ``` -Sub in the number of GPUs you'd like to run on in the last command. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. ## training: more detail -Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: +The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to file. So for example when you run: ```bash -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py ``` -This prints: +We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. 
The output of this looks like this: ``` -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin +writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `prepro_tinystories.py`. +The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. -In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: +In principle, once we have the tokens, we'd be ready to train the model right here. However, current code can't start training from scratch just yet (coming very soon), so we initialize training from the pretrained models released by OpenAI and do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C. This is what happens when you run this script: ```bash python train_gpt2.py ``` -You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. We can now initialize with these model weights and continue training in raw C. First compile the code: +You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. The file also saves both the fp32 versions of the above, and the bfloat16 versions of them for mixed precision training. We can now initialize with these model weights and continue training in raw C. Then we compile the training programs with `make`. 
There are currently three parallel implementations: ```bash +# the simple, CPU, reference code version make train_gpt2 +# the single-GPU fp32 CUDA version +make train_gpt2fp32cu +# the multi-GPU mixed precision CUDA version +make train_gpt2cu ``` -You can have a look inside the `Makefile` and its comments. It will try to autodetect if OpenMP is available on your system, which is very helpful for speeding up the code at very low cost of code complexity. Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: +You can have a look inside the `Makefile` and its comments. It will try to autodetect a lot of tools and libraries (e.g. cuDNN, OpenMP, OpenMPI, nvcc), and you want to get as many checkmarks as possible. For example when I run `make train_gpt2cu USE_CUDNN=1` on my fully configured machine, we see: + +``` +✓ cuDNN found, will run with flash-attention +✓ OpenMP found +✓ OpenMPI found, OK to train with multiple GPUs +✓ nvcc found, including GPU/CUDA support +``` + +Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: ``` # try this first @@ -122,7 +149,7 @@ CFLAGS="-Ofast -fno-finite-math-only -Wno-unused-result -march=native" make trai CFLAGS="-O3 -Wno-unused-result -march=native" make train_gpt2 ``` -Once `train_gpt2` is compiled, you can run it: +Once the binary is compiled, we can run it. For example the simplest CPU reference version runs as: ```bash OMP_NUM_THREADS=8 ./train_gpt2 @@ -164,18 +191,27 @@ Allay --- ``` -I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. Also note that if you are seeing token ids instead of text in the generation, it might be because your code is out of date, as Tokenizer decoding was added April 14, 2024. `git pull` the updates, and then re-run `python train_gpt2.py`, which will now also save the tokenizer, which C can read and then use to print text instead of token ids. +I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. + +Finally, the code is in flux. If anything weird happens that you didn't expect or that worked previously, try to `git pull`, re-run all the commands above, reference back to this README, etc. ## test -I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with: +I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. 
On the CPU as an example, compile and run with: ```bash make test_gpt2 ./test_gpt2 ``` -This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. +This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. To test the GPU version I run: + +```bash +# fp32 test (cudnn not supported) +make test_gpt2cu PRECISION=FP32 && ./test_gpt2cu +# mixed precision cudnn test +make test_gpt2cu USE_CUDNN=1 && ./test_gpt2cu +``` ## tutorial @@ -183,7 +219,7 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layer ## CUDA -The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we roughly match the speed of PyTorch. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. +The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we slightly exceed the speed of PyTorch Nightly. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. **WIP alert, April 23**. We merged the first version of mixed precision training code. I checkpointed the fp32 version to separate files that include `_fp32` in their filename, and would like to preserve this version in the root of the repo because it 1) doesn't require the most up to date CUDA and will a lot more likely compile and is more portable, 2) it is a lot simpler and acts as reference. In fact, we'd like to diverge the fp32 version in the direction of being pure CUDA (e.g. do not even call cuBLAS by default), to be used as an educational reference, maybe even a kernel of a course on CUDA. The "mainline" development concerned with speed will from there on move to the [train_gpt2.cu](train_gpt2.cu) file, which includes mixed precision training. @@ -198,7 +234,7 @@ make test_gpt2fp32cu This prints `overall okay: 1`. So the forward activations, backward gradients, and the individual loss values for 10 iterations all match exactly. -**Training**. To train GPT-2 in a single file of CUDA, run the train script: +**Training**. To train on single GPU in fp32: ```bash make train_gpt2fp32cu @@ -228,9 +264,7 @@ For on his rock shall he be opencast. Keep on with me, my ``` -This runs on my A100 in about ~10 seconds. This training loop in the PyTorch script is about 80ms/iteration, so we are slightly better than PyTorch here. However, this is measured with PyTorch that is a bit stale (I'm on 2.1.0) and we're not yet including FlashAttention or the PyTorch scaled_dot_product_attention fused operation. - -We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: +This runs on my A100 in about ~10 seconds. 
We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: ```bash python train_gpt2.py --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1 @@ -256,7 +290,16 @@ If you have the latest CUDA you should expect this to compile OK, and you should make train_gpt2cu USE_CUDNN=1 ``` -This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. Follow the [cuDNN installation instructions](https://developer.nvidia.com/cudnn) to install cuDNN with apt-get. On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). +This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. The [cuDNN installation instructions](https://developer.nvidia.com/cudnn) with apt-get will grab the default set of cuDNN packages. For a minimal setup, the cuDNN dev package is sufficient, e.g. on Ubuntu 22.04 for CUDA 12.x: + +```bash +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install libcudnn9-dev-cuda-12 +``` + +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line. **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: @@ -333,8 +376,12 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## notable forks +- AMD support + - [llm.c](https://github.com/anthonix/llm.c) by @[anthonix](https://github.com/anthonix): support for AMD devices, such as the 7900 XTX + - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project + - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. 
Clone and run ✅ - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) @@ -353,12 +400,20 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project - Rust + - [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have same performance - [llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project +- Swift + - [llm.swift](https://github.com/otabuzzman/llm.swift) by @[otabuzzman](https://github.com/otabuzzman): a Swift port of this project + - Zig - [llm.zig](https://github.com/Saimirbaci/llm.zig) by @[saimirbaci](https://github.com/Saimirbaci): a Zig port of this project +## major changes log + +- **May 21, 2024: Dataset refactor**. I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. + ## discussions Ways of organizing development: diff --git a/cudnn_att.cu b/cudnn_att.cpp similarity index 58% rename from cudnn_att.cu rename to cudnn_att.cpp index 2735bbd14..04b1a92ec 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cpp @@ -5,18 +5,22 @@ #include #include #include +namespace fe = cudnn_frontend; // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; +static_assert(false, "cuDNN is not supported in FP32 mode.") // use fp16 (note: this may require gradient scaler, currently not implemented!) #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F +#define CUDNN_16BIT fe::DataType_t::HALF #else // Default to bfloat16 typedef __nv_bfloat16 floatX; +#define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif // CUDA error checking @@ -34,24 +38,15 @@ namespace { class NvtxRange { public: NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { std::string range_string = base_str + " " + std::to_string(number); nvtxRangePush(range_string.c_str()); } - ~NvtxRange() { nvtxRangePop(); } }; } #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) -namespace fe = cudnn_frontend; -#if CUBLAS_LOWP == CUDA_R_16BF -#define CUDNN_16BIT fe::DataType_t::BFLOAT16 -#else -#define CUDNN_16BIT fe::DataType_t::HALF -#endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) 
static void* cudnn_workspace = NULL; @@ -65,62 +60,59 @@ static void checkCudnnFE(fe::error_object e, const char *file, int line) { } #define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__) -using graph_tensors_fwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // O - std::shared_ptr // Stats ->; - -using graph_tensors_bwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // O - std::shared_ptr, // dO - std::shared_ptr, // Stats - std::shared_ptr, // Attn_scale, - std::shared_ptr, // dQ, - std::shared_ptr, // dK, - std::shared_ptr // dV ->; +enum UIDs { + Q_UID, + K_UID, + V_UID, + Attn_scale_UID, + O_UID, + Stats_UID, + dO_UID, + dQ_UID, + dK_UID, + dV_UID +}; // Need a cache because graph->build_operation_graph() is slow but everything else seems fast -using cache_type_fwd = std::unordered_map; -using cache_type_bwd = std::unordered_map; +using cache_type_fwd = std::map, std::shared_ptr>; +using cache_type_bwd = std::map, std::shared_ptr>; // Loosely based on cuDNN frontend samples functions and massively simplified -template -auto lookup_cache_or_build_graph_fwd(Args... args) { +auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) { + static cache_type_fwd user_maintained_cache_fwd; - auto [B, H, T, HS, is_inference_only] = std::make_tuple(args...); + auto key = std::make_tuple(B, H, T, HS, is_inference_only); + + auto it = user_maintained_cache_fwd.find(key); + if (it != user_maintained_cache_fwd.end()) { + return it->second; + } + auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) + .set_uid(Q_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) + .set_uid(K_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) + .set_uid(V_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_uid(Attn_scale_UID) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); @@ -131,95 +123,99 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32 - O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}); + O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID); assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) - .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_dim({B, H, T, 1}) + .set_stride({H * T, T, 1, 1}) + .set_uid(Stats_UID); } checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_fwd.find(key); - if (it != user_maintained_cache_fwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + // Reallocate the workspace if the required size is greater than the current workspace + // In H100 this may be around 16B + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } - auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); - user_maintained_cache_fwd.insert({key, tuple}); - return tuple; + user_maintained_cache_fwd.insert({key, graph}); + + return graph; } -template -auto lookup_cache_or_build_graph_bwd(Args... args) { +auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { static cache_type_bwd user_maintained_cache_bwd; - auto [B, NH, T, HS] = std::make_tuple(args...); + + auto key = std::make_tuple(B, NH, T, HS); + + auto it = user_maintained_cache_bwd.find(key); + if (it != user_maintained_cache_bwd.end()) { + return it->second; + } auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") - .set_dim({B, NH, T, 1}) - .set_stride({NH * T, T, 1, 1}) - .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) 
- .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") - .set_causal_mask(true) - .set_attn_scale(attn_scale); + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, NH, T, HS}) + .set_uid(Q_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, NH, T, HS}) + .set_uid(K_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, NH, T, HS}) + .set_uid(V_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") + .set_dim({B, NH, T, HS}) + .set_uid(O_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") + .set_dim({B, NH, T, HS}) + .set_uid(dO_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") + .set_dim({B, NH, T, 1}) + .set_uid(Stats_UID) + .set_stride({NH * T, T, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_uid(Attn_scale_UID) + .set_data_type(fe::DataType_t::FLOAT)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") + .set_causal_mask(true) + .set_attn_scale(attn_scale); // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); - dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); + dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID); + dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID); + dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID); checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_bwd.find(key); - if (it != user_maintained_cache_bwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); @@ -227,9 +223,18 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); - auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); - user_maintained_cache_bwd.insert({key, tuple}); - return tuple; + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + + user_maintained_cache_bwd.insert({key, graph}); + return graph; } void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) @@ -241,8 +246,7 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) bool is_inference_only = (stats == nullptr); // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, attn_scale, O, softmax_stats] = - lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); + auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); // Prepare all the tensor pointers for executing the graph void* devPtrQ = inp; @@ -252,22 +256,12 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) void* devPtrO = out; // Build variant pack - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {attn_scale, &attn_scale_cpu}, {O, devPtrO}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}}; // Add the stats tensor unless we are only doing inference (only needed for backward pass) if (is_inference_only == false) { - variant_pack[softmax_stats] = stats; - } - - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + variant_pack[Stats_UID] = stats; } // Execute graph @@ -282,8 +276,7 @@ void attention_backward_cudnn(floatX* dqkvr, int HS = C / NH; // number of features per head // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, O, dO, Stats, attn_scale, dQ, dK, dV] = - lookup_cache_or_build_graph_bwd(B, NH, T, HS); + auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS); // Prepare all the tensor pointers for executing the graph void* devPtrQ = qkvr; @@ -299,20 +292,10 @@ void attention_backward_cudnn(floatX* dqkvr, void* devPtrdV = (dqkvr + 2 * NH * HS); // Build variant pack that links each tensor to its data pointer - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {O, devPtrO}, {dO, devPtrdO}, {Stats, devPtrStats}, - {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, - {attn_scale, &attn_scale_cpu}}; - - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - 
cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats}, + {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV}, + {Attn_scale_UID, &attn_scale_cpu}}; // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); diff --git a/dataloader.h b/dataloader.h new file mode 100644 index 000000000..6b63c34a1 --- /dev/null +++ b/dataloader.h @@ -0,0 +1,444 @@ +/* +Implements a medium simple DataLoader for a distributed training setup. +*/ +#ifndef DATALOADER_H +#define DATALOADER_H + +#include +#include +#include +#include +#include +#include +// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck +// defines: mallocCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// implementation of glob for Windows is in dev/unistd.h +#ifndef _WIN32 +#include +#endif +// ---------------------------------------------------------------------------- +// Distributed Data Loader +#define HEADER_SIZE 256 + +typedef struct { + // variables related to distributed training + // each process/worker has to access different parts of the data + int process_rank; + int num_processes; + // hyperparameters. use size_t to prevent overflow + size_t B; + size_t T; + // input handling and its state + glob_t glob_result; // stores the result of glob, for all shards we want to iterate + int current_shard; // the current shard we are reading from + FILE* tokens_file; + int64_t file_size; + int64_t current_position; + uint16_t* buffer; // we fread data from file into this buffer + // public variables that could be accessed from outside + size_t num_batches; + int* inputs; // input tokens into transformer + int* targets; // target tokens for the transformer +} DataLoader; + +int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) { + // use the first glob match as the filename for now + const char* filename = loader->glob_result.gl_pathv[shard_index]; + // open the input file for reading. also only a single file can be opened at a time + if (loader->tokens_file != NULL) { + fcloseCheck(loader->tokens_file); + } + loader->tokens_file = fopenCheck(filename, "rb"); + // validate the header + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); + if (header[0] != 20240520) { + printf("Bad magic in the data file\n"); + printf("---> HINT: Are you passing in a correct file?\n"); + printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n"); + exit(EXIT_FAILURE); + } + if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } + int64_t ntok = header[2]; // number of tokens in the file + assert(ntok > 0); // we expect some tokens in the file. this should never trip, right? + // determine the file size and make sure it is consistent with the number of tokens + fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file + loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. 
file size + fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning + // we expect ntok in the file to be consistent with filesize, assert that is the case + int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); + if (loader->file_size != expected_file_size) { + printf("Error: file size is not as expected\n"); + exit(EXIT_FAILURE); + } + return ntok; +} + +void dataloader_reset(DataLoader *loader) { + // fully resets the DataLoader object to init configuration + // each process starts at a different offset in the file + int64_t header_bytes = HEADER_SIZE * sizeof(int); + int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_shard = 0; + loader->current_position = header_bytes + token_bytes_offset; + dataloader_load_shard_(loader, loader->current_shard); +} + +void dataloader_advance_(DataLoader *loader) { + // advance the loader by loading the next data shard and resetting the position + if (loader->glob_result.gl_pathc > 1) { + // if we have more than one shard, advance to the next one + loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc; + dataloader_load_shard_(loader, loader->current_shard); + } + int64_t header_bytes = HEADER_SIZE * sizeof(int); + int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + +void dataloader_init(DataLoader *loader, + const char* filename_pattern, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + loader->tokens_file = NULL; + + // glob to get the list of files matching the pattern, these are our data shards + int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); + if (glob_status != 0) { + printf("Error: failed to glob pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + if (loader->glob_result.gl_pathc == 0) { + printf("Error: no files found matching the pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + + // inspect and validate all shards so we don't get any runtime errors later + // if too slow / too many shards, may wish to revisit later + int64_t ntok_total = 0; + for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) { + int64_t shard_ntok = dataloader_load_shard_(loader, shard_index); + // we need at least one batch/shard, the way things are written right now. + // can be relaxed a lot later. 
+ assert(shard_ntok >= num_processes * B * T + 1); + ntok_total += shard_ntok; + } + // debugging prints + // printf("DataLoader: filename_pattern: %s\n", filename_pattern); + // printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + + // allocate all the space we'll need + loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int*)malloc(B * T * sizeof(int)); + loader->targets = (int*)malloc(B * T * sizeof(int)); + loader->num_batches = ntok_total / (num_processes * B * T); // useful to know + + // reset the loader, to initialize it + dataloader_reset(loader); +} + +void dataloader_next_batch(DataLoader *loader) { + size_t B = loader->B; + size_t T = loader->T; + // read B*T+1 uint16_t tokens from the file into buffer + fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); + freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); + // decode the buffer into inputs and targets (cast to int) + for (int i = 0; i < B*T; i++) { + loader->inputs[i] = (int)loader->buffer[i]; + loader->targets[i] = (int)loader->buffer[i+1]; + } + // advance the current position by B*T*num_processes integers + // note: the "stride" of tokens by which we move each time is definitely B * T + // we only load B * T + 1 tokens at each iteration because the targets are offset by 1 + loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); + // if the next batch would go past the end of the file, advance the loader + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_advance_(loader); + } +} + +void dataloader_free(DataLoader *loader) { + free(loader->buffer); + free(loader->inputs); + free(loader->targets); + fcloseCheck(loader->tokens_file); + globfree(&loader->glob_result); +} + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like) HellaSwag and MMLU are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve these style of evals +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - delimiter of 2**16-1, i.e. 65,535 + - , bytes encoding this example, allowing efficient skip to next + - , the index of the example in the dataset + -
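For reference, the token shard layout that dataloader.h validates above is: a 1024-byte header of 256 int32s (magic 20240520 at header[0], format version 1 at header[1], token count at header[2]), followed by the tokens as a flat uint16 stream. The sketch below is a minimal standalone checker of that layout, not part of the patch itself; it assumes a shard produced by one of the dev/data/*.py scripts (the default path is only an example) and simplifies error handling compared to the fopenCheck/freadCheck helpers the real loader uses.

```c
// Hypothetical standalone sketch: validate the header of a token .bin shard,
// mirroring the checks done in dataloader_load_shard_ above.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HEADER_SIZE 256  // header is 256 int32s = 1024 bytes

int main(int argc, char **argv) {
    // example path; pass any shard written by dev/data/*.py as argv[1]
    const char *path = argc > 1 ? argv[1] : "dev/data/tinyshakespeare/tiny_shakespeare_val.bin";
    FILE *f = fopen(path, "rb");
    if (f == NULL) { fprintf(stderr, "could not open %s\n", path); return 1; }
    int header[HEADER_SIZE];
    if (fread(header, sizeof(int), HEADER_SIZE, f) != HEADER_SIZE) { fclose(f); return 1; }
    if (header[0] != 20240520) { fprintf(stderr, "bad magic\n"); fclose(f); return 1; }  // magic number
    if (header[1] != 1) { fprintf(stderr, "bad version\n"); fclose(f); return 1; }       // format version
    int64_t ntok = header[2];  // number of uint16 tokens that follow the header
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    int64_t expected = (int64_t)HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t);
    printf("%s: %lld tokens, file size %ld bytes, expected %lld bytes\n",
           path, (long long)ntok, file_size, (long long)expected);
    fclose(f);
    return file_size == expected ? 0 : 1;
}
```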