From 3254a51522a339e8a1e8c18acf9ec557c20950b7 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:07:01 +0000 Subject: [PATCH 001/172] Zero Optimizations configs --- train_gpt2.cu | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 845148016..9f05c37ff 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -337,6 +337,16 @@ typedef struct { int process_rank; // Rank of this process among all MPI processes. 0 if no multi-GPU. int num_processes; // Total number of processes. 1 if no multi-GPU. int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. + + // Zero optimization stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // 0-Disabled + // 1-Optimizer State Sharding (OSS) + // 2-Optimizer + Gradient State Sharding (SDP) + // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) + int zero_stage; + bool zero_active; + size_t shard_num_parameters; + size_t shard_offset; #ifdef MULTI_GPU ncclComm_t nccl_comm; // NCCL communication primitive, used for collective multi-GPU work. #endif @@ -1905,7 +1915,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); @@ -2087,6 +2097,7 @@ void error_usage() { fprintf(stderr, " -m val_max_batches, up to how many val batches to estimate val loss? (default = 20)\n"); fprintf(stderr, " -s sample_every, how often we inference the model (default = 20)\n"); fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n"); + fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); exit(EXIT_FAILURE); } @@ -2105,6 +2116,7 @@ int main(int argc, char *argv[]) { int val_max_batches = 20; // how many batches max do we eval for validation loss? int sample_every = 20; // every how many steps to do inference? 
int genT = 64; // number of steps of inference we will do + int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2119,6 +2131,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'm') { val_max_batches = atoi(argv[i+1]); } else if (argv[i][1] == 's') { sample_every = atoi(argv[i+1]); } else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); } + else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else { error_usage(); } } printf0("+-----------------------+----------------------------------------------------+\n"); @@ -2190,7 +2203,9 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // pretty print in a table the multi-gpu configuration as well + set_zero_configs(&multi_gpu_config, zero_stage, model.num_parameters); printf0("| num_processes | %-50d |\n", multi_gpu_config.num_processes); + printf0("| zero_stage | %-50d |\n", multi_gpu_config.zero_stage); printf0("+-----------------------+----------------------------------------------------+\n"); // more prints related to allocations from gpt2_build_from_checkpoint down here to not mess up our table above From 2d26ec10b2b4439e4db12bd49b2e765a377982d4 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:08:25 +0000 Subject: [PATCH 002/172] setting the zero opt configs --- train_gpt2.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/train_gpt2.cu b/train_gpt2.cu index 9f05c37ff..c835b6976 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -439,6 +439,36 @@ void printf0(const char *format, ...) 
{ } } +void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t total_parameters) { + + multi_gpu_config->zero_stage = 0; + multi_gpu_config->zero_active = false; + multi_gpu_config->shard_num_parameters = total_parameters; + multi_gpu_config->shard_offset = 0; + +#ifdef MULTI_GPU + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % multi_gpu_config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + } + else { + printf0("| Zero Stage1 is enabled |\n"); + multi_gpu_config->zero_stage = 1; + multi_gpu_config->zero_active = true; + multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; + multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + } +#endif +} + // ---------------------------------------------------------------------------- // all the kernels From b3e8abdd54921d0aad8ebd75458153806fd6d53a Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:09:52 +0000 Subject: [PATCH 003/172] optimizer update per shard and nccl all gather --- train_gpt2.cu | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index c835b6976..586656a50 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1943,28 +1943,39 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - + size_t num_parameters = multi_gpu_config->shard_num_parameters; + size_t offset = multi_gpu_config->shard_offset; + floatX* params_memory = (floatX*)model->params_memory + offset; + floatX* grads_memory = (floatX*)model->grads_memory + offset; + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); } int block_size = 512; - int num_blocks = 
CEIL_DIV(model->num_parameters, block_size); + int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, (floatX*)model->grads_memory, model->m_memory, model->v_memory, - model->num_parameters, + adamw_kernel3<<>>(params_memory, grads_memory, model->m_memory, model->v_memory, + num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); + + if (multi_gpu_config->zero_active) { + // gather all parameter updates from each process + ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, + num_parameters, ncclFloatX, + multi_gpu_config->nccl_comm, 0)); // using default stream + } } void gpt2_free(GPT2 *model) { From 1f442fdf7f8720227d8e898295a577f0d4e15853 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:10:50 +0000 Subject: [PATCH 004/172] fix gpt2_update call --- train_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 586656a50..a958a24eb 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2346,7 +2346,7 @@ int main(int argc, char *argv[]) { if (multi_gpu_config.num_processes > 1) { gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); } - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings clock_gettime(CLOCK_MONOTONIC, &end); double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; From d35d2e606bc8003a08b9fb5c873e3d3f7c0a3fde Mon Sep 17 00:00:00 2001 From: chinthysl Date: Thu, 2 May 2024 07:31:34 +0000 Subject: [PATCH 005/172] Generalized copy_and_cast_kernel and changes to cater model->master_weights --- train_gpt2.cu | 56 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 6b2e59456..df44d78f9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -358,7 +358,7 @@ typedef struct { int num_processes; // Total number of processes. 1 if no multi-GPU. int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. - // Zero optimization stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html // 0-Disabled // 1-Optimizer State Sharding (OSS) // 2-Optimizer + Gradient State Sharding (SDP) @@ -1325,10 +1325,37 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* } } -__global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { - // a small kernel to copy and cast, i.e. 
`dst <- (float) src` - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] = (float)src[i]; } +// device functions and the kernel to cast data between types +template +__device__ Td cast_value(Ts val); + +template<> +__device__ float cast_value(half val) { + return __half2float(val); +} + +template<> +__device__ half cast_value(float val) { + return __float2half(val); +} + +template<> +__device__ __nv_bfloat16 cast_value<__nv_bfloat16, float>(float val) { + return __float2bfloat16(val); +} + +template<> +__device__ float cast_value(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // need to try grid stride looping for more perf later + if (idx < n) { + dst[idx] = cast_value(src[idx]); + } } // ---------------------------------------------------------------------------- @@ -2282,10 +2309,6 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo size_t offset = multi_gpu_config->shard_offset; floatX* params_memory = (floatX*)model->params_memory + offset; floatX* grads_memory = (floatX*)model->grads_memory + offset; - float* master_params = NULL; - if (model->use_master_weights == 1) { - master_weights = model->master_weights + offset; - } // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { @@ -2304,6 +2327,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + float* master_weights = NULL; + if (model->use_master_weights == 1) { + master_weights = model->master_weights + offset; + } + int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); @@ -2314,17 +2342,13 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo cudaCheck(cudaGetLastError()); if (multi_gpu_config->zero_active) { - // gather all parameter updates from each process, should use 2 cudastreams in future - ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, - num_parameters, ncclFloatX, - multi_gpu_config->nccl_comm, 0)); + // gather all parameter updates from each process if (model->use_master_weights == 1) { ncclCheck(ncclAllGather(master_weights, model->master_weights, num_parameters, ncclFloat, multi_gpu_config->nccl_comm, 0)); - // Fix and generalize the kernel - // copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); - + // Copy and cast gathered master weights to params + copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); } else { ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, From 632caf1ce645695754ce2032b9e8ef126c97574c Mon Sep 17 00:00:00 2001 From: chinthysl Date: Thu, 2 May 2024 07:38:49 +0000 Subject: [PATCH 006/172] Refactored zero_active var --- train_gpt2.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index df44d78f9..ba15a9b02 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -364,7 +364,6 @@ typedef struct { // 2-Optimizer + Gradient State Sharding (SDP) // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) int zero_stage; - bool zero_active; size_t shard_num_parameters; size_t shard_offset; #ifdef MULTI_GPU @@ -462,7 +461,6 @@ void printf0(const char *format, ...) 
{ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t total_parameters) { multi_gpu_config->zero_stage = 0; - multi_gpu_config->zero_active = false; multi_gpu_config->shard_num_parameters = total_parameters; multi_gpu_config->shard_offset = 0; @@ -474,17 +472,18 @@ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t t else if (zero_stage == 1) { if (total_parameters % multi_gpu_config->num_processes != 0) { printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + multi_gpu_config->zero_stage = 0; } else { printf0("| Zero Stage1 is enabled |\n"); multi_gpu_config->zero_stage = 1; - multi_gpu_config->zero_active = true; multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); } } else{ printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + multi_gpu_config->zero_stage = 0; } #endif } @@ -2341,7 +2340,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); - if (multi_gpu_config->zero_active) { + if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each process if (model->use_master_weights == 1) { ncclCheck(ncclAllGather(master_weights, model->master_weights, From c81adeb6b254e8fdc4b9fad722c69442806bdb6e Mon Sep 17 00:00:00 2001 From: lancer Date: Thu, 2 May 2024 17:36:12 -0700 Subject: [PATCH 007/172] move gelu_backward to backward block --- dev/cuda/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index 4ea763762..834a98b0f 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -26,7 +26,6 @@ attention_forward: attention_forward.cu classifier_fused: classifier_fused.cu crossentropy_forward: crossentropy_forward.cu encoder_forward: encoder_forward.cu -gelu_backward: gelu_backward.cu gelu_forward: gelu_forward.cu layernorm_forward: layernorm_forward.cu residual_forward: residual_forward.cu @@ -40,6 +39,7 @@ matmul_forward: matmul_forward.cu attention_backward: attention_backward.cu crossentropy_softmax_backward: crossentropy_softmax_backward.cu encoder_backward: encoder_backward.cu +gelu_backward: gelu_backward.cu layernorm_backward: layernorm_backward.cu matmul_backward_bias: matmul_backward_bias.cu matmul_backward: matmul_backward.cu From 59b66f6c07d36c1e7307437f8e1e36470d7ecf12 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Sat, 4 May 2024 16:22:44 +0800 Subject: [PATCH 008/172] refactor and fix CI issue --- Makefile | 2 +- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 68 +++++++++++++++++++++++++++++++------------------ 4 files changed, 46 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 06923801d..ae43f4c75 100644 --- a/Makefile +++ b/Makefile @@ -207,7 +207,7 @@ test_gpt2fp32cu: test_gpt2_fp32.cu $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index f412eed89..445d9dbe6 
100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -76,7 +76,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, model.num_parameters, 0); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free gpt2_free(&model); diff --git a/test_gpt2.cu b/test_gpt2.cu index 67afa5065..c6ca4a8a8 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -281,7 +281,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, model.num_parameters, 0); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index 4f0e32dd3..19dd000e9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1379,6 +1379,11 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* template __device__ Td cast_value(Ts val); +template<> +__device__ float cast_value(float val) { + return val; +} + template<> __device__ float cast_value(half val) { return __half2float(val); @@ -2373,6 +2378,11 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); #ifdef MULTI_GPU + // all gather is only required when num_processes > 1 + if (multi_gpu_config->num_processes == 1) { + return; + } + // Average all gradients. ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, @@ -2383,22 +2393,18 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, size_t shard_num_parameters, size_t shard_offset) { NVTX_RANGE_FN(); // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - size_t num_parameters = multi_gpu_config->shard_num_parameters; - size_t offset = multi_gpu_config->shard_offset; - floatX* params_memory = (floatX*)model->params_memory + offset; - floatX* grads_memory = (floatX*)model->grads_memory + offset; - // lazily allocate the memory for m_memory and v_memory + // lazily allocate the memory for m_memory and v_memory according to shard configs if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, 
shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, shard_num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); @@ -2408,35 +2414,48 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + floatX* params_memory = (floatX*)model->params_memory + shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + shard_offset; float* master_weights = NULL; if (model->use_master_weights == 1) { - master_weights = model->master_weights + offset; + master_weights = model->master_weights + shard_offset; } int block_size = 512; - int num_blocks = CEIL_DIV(num_parameters, block_size); + int num_blocks = CEIL_DIV(shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, + adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); +} + +void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) +{ +#ifdef MULTI_GPU + // all gather is only required when num_processes > 1 + if (multi_gpu_config->num_processes == 1) { + return; + } if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each process if (model->use_master_weights == 1) { - ncclCheck(ncclAllGather(master_weights, model->master_weights, - num_parameters, ncclFloat, + ncclCheck(ncclAllGather(model->master_weights + multi_gpu_config->shard_offset, model->master_weights, + multi_gpu_config->shard_num_parameters, ncclFloat, multi_gpu_config->nccl_comm, 0)); - // Copy and cast gathered master weights to params + // Copy and cast master weights to params copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); } else { - ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, - num_parameters, ncclFloatX, + ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, + multi_gpu_config->shard_num_parameters, ncclFloatX, multi_gpu_config->nccl_comm, 0)); } - } + } + cudaCheck(cudaGetLastError()); +#endif } void gpt2_free(GPT2 *model) { @@ -2846,10 +2865,9 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - if (multi_gpu_config.num_processes > 1) { - gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - } - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, multi_gpu_config.shard_num_parameters, multi_gpu_config.shard_offset); + gpt2_multi_gpu_gather(&model, &multi_gpu_config); cudaEventRecord(end); 
float time_elapsed_ms; From bfb9c51446a5b0219c8fa8a2d4ffc06414293f3a Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:40:15 +0100 Subject: [PATCH 009/172] refactoring & remove unused functions to reduce LOC (+wip profile.py improvements) --- profile_gpt2.cu | 51 +--- profile_gpt2cu.py | 19 +- test_gpt2.cu | 49 +-- train_gpt2.cu | 761 +++++++++++++++++++--------------------------- 4 files changed, 328 insertions(+), 552 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index eab7fc58e..fd1d78d94 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -28,44 +28,7 @@ the profile.ncu-rep from a cloud box to local to pretty view. #include "train_gpt2.cu" int main() { - - // set up the device - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); - - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; - - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // setup cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = deviceProp.major >= 8 ? 1 : 0; - printf("enable_tf32: %d\n", enable_tf32); - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - // setup the (global) cuBLASLt workspace - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + common_start(); // build the GPT-2 model from a checkpoint GPT2 model; @@ -91,16 +54,8 @@ int main() { gpt2_backward(&model); gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings - // free - gpt2_free(&model); - - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); + // free + common_free(model); return 0; } diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index b3eec863a..8e15b7dc2 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -31,7 +31,7 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "smsp__inst_executed.sum", # instructions + "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # todo - tensor core % ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -55,11 +55,11 @@ for rid, row in enumerate(reader): if rid == 0: # headings - print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print(f"id pass {'name':<70} {'time':>8} {'RAM BW':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<70} {'ms':>8} {'GB/s':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" print(units) print("." * len(units)) continue @@ -74,7 +74,7 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) / 1e6 + inst = float(row[16]) kid = rid - 2 @@ -118,18 +118,21 @@ total['l2_write'] += l2_write total['inst'] += inst - print(f"{kid:02} {pass_name:4} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + dram_bw = (read + write) / (time / 1000.0); + + print(f"{kid:02} {pass_name:4} {fn_name:<70} {time:8.2f} {dram_bw:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") total_time = total['time'] +total_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0); print("." 
* len(units)) -print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<70} {total['time']:8.2f} {total_dram_bw:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") -print(f" {'name':<40} {'time':>6} {'frac':>6}") +print(f" {'name':<70} {'time':>6} {'frac':>6}") ordered = sorted(summaries.items(), key=lambda x: x[1], reverse=True) for entry, value in ordered: - print(f" {entry:<40} {value:6.2f} {100*value / total_time:6.2f}%") + print(f" {entry:<70} {value:6.2f} {100*value / total_time:6.2f}%") ts = total_time / 1000 diff --git a/test_gpt2.cu b/test_gpt2.cu index 9c98f5684..3fc6b6f0e 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,44 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { - - // set up the device - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); - - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; - - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // setup cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = cuda_arch_major >= 8 ? 1 : 0; - enable_tf32 = 0; // NOTE: disable TF32 for testing!!! - printf("enable_tf32: %d\n", enable_tf32); - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + common_start(false); // build the GPT-2 model from a checkpoint GPT2 model; @@ -327,6 +290,7 @@ int main(int argc, char *argv[]) { printf("overall okay: %d\n", allok); // free everything + common_free(model); free(x); free(y); free(logits_cpu_raw); @@ -336,14 +300,5 @@ int main(int argc, char *argv[]) { free(expected_grads_memory); free(grads_memory_cpu); free(grads_memory_cpu_float); - gpt2_free(&model); - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); - return 0; } diff --git a/train_gpt2.cu b/train_gpt2.cu index 971e9be27..2a86fdec5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -11,7 +11,7 @@ sure that those parts work out ok and that we do a += as necessary. 
E.g., the layernorms are connected to the residuals so we += in layernorm backward. In this file we are using Mixed Precision training, so different activations, -paramaters, grads and buffers may be kept at different precisions, to take +parameters, grads and buffers may be kept at different precisions, to take advantage of the fast low-precision hardware in the latest GPUs (bf16/fp16), and fp8 (coming soon^TM). @@ -33,26 +33,15 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), -a 1 is "overfit single batch", -x 10 is 10 iterations, and -f 0 disables tf32 */ -#include - #include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include // GPU / CUDA related -#include -#include #include +#include #include -#include #include - +#include // Multi-GPU related #ifdef MULTI_GPU #include @@ -73,20 +62,12 @@ enum PrecisionMode { PRECISION_BF16 }; -// Default Properties -typedef float floatN; -#define CUBLAS_LOWP_COMPUTE cublas_compute_type -#ifdef MULTI_GPU -const ncclDataType_t ncclFloatN = ncclFloat; -#endif - // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; #define CUBLAS_LOWP CUDA_R_32F #define PRECISION_MODE PRECISION_FP32 const char* load_filename = "gpt2_124M.bin"; -const char* precision_mode_str = "fp32"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclFloat; #endif @@ -97,7 +78,6 @@ typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define PRECISION_MODE PRECISION_FP16 const char* load_filename = "gpt2_124M.bin"; -const char* precision_mode_str = "fp16"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclHalf; #endif @@ -107,7 +87,6 @@ typedef __nv_bfloat16 floatX; #define CUBLAS_LOWP CUDA_R_16BF #define PRECISION_MODE PRECISION_BF16 const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights specific filename -const char* precision_mode_str = "bf16"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclBfloat16; #endif @@ -121,11 +100,9 @@ namespace fe = cudnn_frontend; #else #define CUDNN_16BIT fe::DataType_t::HALF #endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) static void* cudnn_workspace = NULL; -#define checkCudnnErr(err) assert((int)err == 0); #endif // ENABLE_CUDNN // ---------------------------------------------------------------------------- @@ -144,17 +121,14 @@ class NvtxRange { #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) // cuBLAS workspace. Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK -static size_t cublaslt_workspace_size = 32 * 1024 * 1024; -static void* cublaslt_workspace = NULL; -static cublasComputeType_t cublas_compute_type; -cublasHandle_t cublas_handle; +const size_t cublaslt_workspace_size = 32 * 1024 * 1024; +void* cublaslt_workspace = NULL; +cublasComputeType_t cublas_compute = CUBLAS_COMPUTE_32F; cublasLtHandle_t cublaslt_handle; -int cuda_arch_major = 0; -int cuda_arch_minor = 0; -int cuda_num_SMs = 0; // for persistent threads where we want 1 threadblock per SM -int cuda_threads_per_SM = 0; +cublasHandle_t cublas_handle; +cudaDeviceProp deviceProp; -// CUDA streams & events (note: non-timing events, use separate event for timing/profiling!) +// CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
constexpr int num_parallel_streams = 2; // + 1 primary "main_stream" (+ default stream) cudaStream_t parallel_streams[num_parallel_streams]; cudaEvent_t parallel_events[num_parallel_streams]; @@ -168,8 +142,7 @@ cudaEvent_t loss_event; // to make sure fused_classifier has written the losses // CUDA error checking void cudaCheck(cudaError_t error, const char *file, int line) { if (error != cudaSuccess) { - printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, - cudaGetErrorString(error)); + printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error)); exit(EXIT_FAILURE); } }; @@ -206,35 +179,6 @@ void mpi_check(int status, const char *file, int line) { #define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) #endif -// GPU helper functions for atomicAdd on smaller than 32-bit types -#ifdef ENABLE_BF16 -__device__ void atomicAddX(__nv_bfloat16* addr, __nv_bfloat16 val) { - uintptr_t ptr_val = reinterpret_cast(addr); - __nv_bfloat162* ptr_bf16 = reinterpret_cast<__nv_bfloat162*>(ptr_val & ~uintptr_t(0x3)); - - // Prepare the value to add, setting the other half to zero - __nv_bfloat162 add_val = (ptr_val & 0x3) ? __halves2bfloat162(__ushort_as_bfloat16(0), val) - : __halves2bfloat162(val, __ushort_as_bfloat16(0)); - atomicAdd(ptr_bf16, add_val); -} -#endif - -#ifdef ENABLE_FP16 -__device__ void atomicAddX(half* addr, half val) { - uintptr_t ptr_val = reinterpret_cast(addr); - half2* ptr_fp16 = reinterpret_cast(ptr_val & ~uintptr_t(0x3)); - - // Prepare the value to add, setting the other half to zero - half2 add_val = (ptr_val & 0x3) ? __halves2half2(__ushort_as_half(0), val) - : __halves2half2(val, __ushort_as_half(0)); - atomicAdd(ptr_fp16, add_val); -} -#endif - -__device__ void atomicAddX(float* addr, float val) { - atomicAdd(addr, val); -} - // warp-level reduction for summing values __device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -242,7 +186,6 @@ __device__ float warpReduceSum(float val) { } return val; } - // warp-level reduction for finding the maximum value __device__ float warpReduceMax(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -250,16 +193,6 @@ __device__ float warpReduceMax(float val) { } return val; } - -#if defined(ENABLE_BF16) || defined(ENABLE_FP16) -__device__ floatX warpReduceSum(floatX val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} -#endif - // requires all 32 threads in the warp to be active, but should work for any block size // uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes // the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end @@ -270,14 +203,13 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun // two reductions of up to 1024 threads: // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) __shared__ float shared_val[32]; - int lane_id = threadIdx.x % 32; - int warp_id = threadIdx.x / 32; - int num_warps = blockDim.x / 32; + const int lane_id = threadIdx.x % 32; + const int warp_id = threadIdx.x / 32; + const int num_warps = blockDim.x / 32; float warp_val = warp_reduction(val); if (lane_id == 0) { shared_val[warp_id] = warp_val; } __syncthreads(); - // same strategy, now reduce across warps warp_val = (lane_id < num_warps) ? 
shared_val[lane_id] : out_of_bounds; float block_val = warp_reduction(warp_val); @@ -287,7 +219,6 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun return block_val; } - // ---------------------------------------------------------------------------- // Packed128 data structure, which forces the compiler to use 128-bit loads/stores // in GPUs that support (the LDG.128 and STS.128 instructions) @@ -296,12 +227,11 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun template struct alignas(16) Packed128 { - __device__ Packed128() = default; + Packed128() = default; __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); } - __device__ ElementType& operator[](int index) { return payload[index]; } @@ -314,39 +244,35 @@ struct alignas(16) Packed128 { memcpy(&bits, &payload, sizeof(bits)); return bits; } - static constexpr const size_t size = sizeof(int4) / sizeof(ElementType); ElementType payload[size]; }; -// short-form typedef -typedef Packed128 f128; -typedef Packed128 x128; - // load a Packed128 from an aligned memory address template __device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } - // load a Packed128 from an aligned memory address with streaming cache hint template __device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } - // store a Packed128 to an aligned memory address template __device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } - // store a Packed128 to an aligned memory address with streaming cache hint template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// short-form typedefs +typedef Packed128 f128; +typedef Packed128 x128; + // ---------------------------------------------------------------------------- // Random Number Generatiom @@ -387,25 +313,11 @@ __device__ __host__ constexpr unsigned int SquirrelNoise5(int positionX, unsigne mangledBits ^= (mangledBits >> 17); return mangledBits; } -__device__ __host__ constexpr unsigned int Get1dNoiseUint(int positionX, unsigned int seed) -{ - return SquirrelNoise5(positionX, seed); -} __device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed) { constexpr int PRIME_NUMBER = 198491317; // Large prime number with non-boring bits return SquirrelNoise5(indexX + (PRIME_NUMBER * indexY), seed); } -__device__ __host__ constexpr float Get1dNoiseZeroToOne(int index, unsigned int seed) -{ - constexpr double ONE_OVER_MAX_UINT = (1.0 / (double) 0xFFFFFFFF); - return (float)(ONE_OVER_MAX_UINT * (double) SquirrelNoise5(index, seed)); -} -__device__ __host__ constexpr float Get2dNoiseZeroToOne(int indexX, int indexY, unsigned int seed) -{ - constexpr double ONE_OVER_MAX_UINT = (1.0 / (double) 0xFFFFFFFF); - return (float)(ONE_OVER_MAX_UINT * (double) Get2dNoiseUint(indexX, indexY, seed)); -} // stochastic rounding built on top of Squirel Noise above (with seed updated per step via xorshift) __device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) { @@ -564,20 +476,16 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) @@ -613,6 +521,7 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); assert(graph->check_support(cudnn_handle).is_good()); assert(graph->build_plans(cudnn_handle).is_good()); + assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); user_maintained_cache_fwd.insert({key, tuple}); @@ -631,40 +540,32 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") .set_dim({B, NH, T, HS}) .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") .set_dim({B, NH, T, HS}) .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") .set_dim({B, NH, T, 1}) .set_stride({NH * T, T, 1, 1}) .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") 
.set_causal_mask(true) .set_attn_scale(attn_scale); @@ -688,6 +589,16 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { assert(graph->check_support(cudnn_handle).is_good()); assert(graph->build_plans(cudnn_handle).is_good()); + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); user_maintained_cache_bwd.insert({key, tuple}); return tuple; @@ -721,16 +632,6 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) variant_pack[softmax_stats] = stats; } - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph assert(graph->execute(cudnn_handle, variant_pack, cudnn_workspace).is_good()); cudaCheck(cudaGetLastError()); @@ -765,16 +666,6 @@ void attention_backward_cudnn(floatX* dqkvr, {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, {attn_scale, &attn_scale_cpu}}; - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph assert(graph->execute(cudnn_handle, variant_pack, cudnn_workspace).is_good()); cudaCheck(cudaGetLastError()); @@ -789,51 +680,70 @@ __global__ void encoder_forward_kernel3(floatX* out, int B, int T, int C) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; int N = B * T * C; - if (idx < N) { - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; - - int ix = inp[b * T + t]; - - floatX* out_btc = out + b * T * C + t * C + c; - const floatX* wte_ix = wte + ix * C + c; - const floatX* wpe_tc = wpe + t * C + c; - - x128 packed_out; - x128 wte = load128cs(wte_ix); - x128 wpe = load128cs(wpe_tc); - #pragma unroll - for (int k = 0; k < wte.size; k++) { - packed_out[k] = (floatX)((float)wte[k] + (float)wpe[k]); - } - store128(out_btc, packed_out); + if (idx >= N) { return; } + + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; + + int ix = inp[b * T + t]; + + floatX* out_btc = out + b * T * C + t * C + c; + const floatX* wte_ix = wte + ix * C + c; + const floatX* wpe_tc = wpe + t * C + c; + + x128 packed_out; + x128 wte128 = load128cs(wte_ix); + x128 wpe128 = load128cs(wpe_tc); + for (int k = 0; k < x128::size; k++) { + packed_out[k] = (floatX)((float)wte128[k] + (float)wpe128[k]); } + store128(out_btc, packed_out); +} + +__device__ void atomicStochasticAdd(__nv_bfloat16* address, float val0, float val1, uint seed) { + float2 
val = make_float2(val0, val1); + uint* address_as_uint = (uint*)address; + uint old = *address_as_uint, assumed; + uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); + do { + assumed = old; + float2 old_fp32 = __bfloat1622float2(*(__nv_bfloat162*)&old); + float2 new_fp32 = make_float2(old_fp32.x + val.x, old_fp32.y + val.y); + __nv_bfloat162 new_bf16; + stochastic_rounding(new_fp32.x, &new_bf16.x, random); + stochastic_rounding(new_fp32.y, &new_bf16.y, random >> 16); + old = atomicCAS(address_as_uint, assumed, *(uint*)&new_bf16); + } while (assumed != old); +} +__device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { + atomicAdd(address, val0); + atomicAdd(address + 1, val1); } -// really bad naive kernel with atomicAdd __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C) { + int B, int T, int C, uint seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int N = B * T * C; + idx *= 2; // 2 elements per thread + if (idx >= N) { return; } - if (idx < N) { - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; - int ix = inp[b * T + t]; + int ix = inp[b * T + t]; - const floatX* dout_btc = dout + b * T * C + t * C + c; - floatX* dwte_ix = dwte + ix * C + c; - floatX* dwpe_tc = dwpe + t * C + c; + const floatX* dout_btc = dout + b * T * C + t * C + c; + floatX* dwte_ix = dwte + ix * C + c; + floatX* dwpe_tc = dwpe + t * C + c; - atomicAddX(dwte_ix, (floatX)*dout_btc); - atomicAddX(dwpe_tc, (floatX)*dout_btc); - } + float2 dout_data = make_float2(dout_btc[0], dout_btc[1]); + atomicStochasticAdd(dwte_ix, dout_data.x, dout_data.y, seed); + atomicStochasticAdd(dwpe_tc, dout_data.x, dout_data.y, seed ^ 0xFFFFFFFF); } __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, @@ -891,38 +801,38 @@ __global__ void permute_kernel(floatX* q, floatX* k, floatX* v, // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, N, d) // but instead, we have a single tensor QKV (inp) of shape (B, N, 3, NH, d) int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * NH * N * d) { return; } + // Q[b][nh_][n][d_] = inp[b][n][0][nh_][d_] - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - q[idx] = __ldcs(&inp[inp_idx]); - k[idx] = __ldcs(&inp[inp_idx + NH * d]); - v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); - } + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + q[idx] = __ldcs(&inp[inp_idx]); + k[idx] = __ldcs(&inp[inp_idx + NH * d]); + v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); } __global__ void permute_kernel_backward(floatX* dinp, const floatX* dq, const floatX* dk, const floatX* dv, int B, int N, int NH, int d) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; + if (idx >= B * NH * N * d) { return; } - int inp_idx = 
(b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - dinp[inp_idx] = dq[idx]; - dinp[inp_idx + NH * d] = dk[idx]; - dinp[inp_idx + 2 * (NH * d)] = dv[idx]; - } + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + dinp[inp_idx] = dq[idx]; + dinp[inp_idx + NH * d] = dk[idx]; + dinp[inp_idx + 2 * (NH * d)] = dv[idx]; } __global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int d) { @@ -930,30 +840,30 @@ __global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int idx = (blockIdx.x * blockDim.x + threadIdx.x); // out[b][n][nh_][d_] <- inp[b][nh_][n][d_] - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - out[other_idx] = __ldcs(&inp[idx]); - } + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + out[other_idx] = __ldcs(&inp[idx]); } __global__ void unpermute_kernel_backward(floatX* dinp, const floatX *dout, int B, int N, int NH, int d) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - dinp[idx] = (floatX)dout[other_idx]; - } + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + dinp[idx] = (floatX)dout[other_idx]; } __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, const floatX* inp, int N, int T) { @@ -983,13 +893,13 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons const floatX* x = inp + idx * T; // not INF, so we don't get NaNs accidentally when subtracting two values. 
+ const float FLT_MAX = 340282346638528859811704183484516925440.0f; // to avoid including float.h float maxval = -FLT_MAX; float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); for (int i = lane_id; i < pos_by_4; i += warp_size) { float regarray[4]; - #pragma unroll for (int k = 0; k < 4; ++k) { regarray[k] = (float)x_aligned[4*i + k]; } @@ -1026,67 +936,61 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons __global__ void residual_forward_kernel(floatX* out, floatX* inp1, floatX* inp2, int N) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx < N) { - x128 packed_out; - x128 packed_inp1 = load128cs(inp1 + idx); - x128 packed_inp2 = load128cs(inp2 + idx); - #pragma unroll - for (int k = 0; k < packed_inp1.size; k++) { - packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); - } - store128(out + idx, packed_out); + if (idx >= N) { return; } + + x128 packed_out; + x128 packed_inp1 = load128cs(inp1 + idx); + x128 packed_inp2 = load128cs(inp2 + idx); + for (int k = 0; k < packed_inp1.size; k++) { + packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); } + store128(out + idx, packed_out); } #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { - int i = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (i < N) { - x128 packed_out; - x128 packed_inp = load128cs(inp + i); // load and do not keep in cache - for(int k = 0; k < packed_inp.size; ++k) { - float xi = (float)packed_inp[k]; - float cube = 0.044715f * xi * xi * xi; - packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); - } - // store instead of storecs (without cache streaming) in case it is useful for the - // data to be in the cache for the next operation after this GeLU - store128(out + i, packed_out); + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= N) { return; } + + x128 packed_out; + x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache + for(int k = 0; k < packed_inp.size; ++k) { + float xi = (float)packed_inp[k]; + float cube = 0.044715f * xi * xi * xi; + packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); } + // store instead of storecs (without cache streaming) in case it is useful for the + // data to be in the cache for the next operation after this GeLU + store128(out + idx, packed_out); } __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { - int i = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (i < N) { - x128 packed_dinp; - x128 packed_inp = load128cs(inp + i); - x128 packed_dout = load128cs(dout + i); - for (int k = 0; k < packed_inp.size; ++k) { - float x = (float)packed_inp[k]; - float cube = 0.044715f * x * x * x; - float tanh_arg = GELU_SCALING_FACTOR * (x + cube); - float tanh_out = tanhf(tanh_arg); - float coshf_out = coshf(tanh_arg); - float sech_out = 1.0f / (coshf_out * coshf_out); - float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); - packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); - } + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= N) { return; } - store128(dinp + i, packed_dinp); + x128 packed_dinp; + x128 packed_inp = load128cs(inp + idx); + x128 packed_dout = load128cs(dout + idx); + for 
(int k = 0; k < packed_inp.size; ++k) { + float x = (float)packed_inp[k]; + float cube = 0.044715f * x * x * x; + float tanh_arg = GELU_SCALING_FACTOR * (x + cube); + float tanh_out = tanhf(tanh_arg); + float coshf_out = coshf(tanh_arg); + float sech_out = 1.0f / (coshf_out * coshf_out); + float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); + packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); } + store128(dinp + idx, packed_dinp); } __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, int B, int T, int OC) { // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX + // (this also results in higher accuracy than doing doing accumulation directly in floatX) - // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) - // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end - // blockDim.x is 32 --> single warp being responsible for those 256 OCs - // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs - // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * cuda_threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + // see comments in matmul_backward() for an explanation of block/grid dimensions etc. const int block_size = 512; const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 @@ -1101,26 +1005,23 @@ __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int i = thread_id; i < OC_per_warp; i += block_size) { - shared[i] = 0.0f; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; } __syncthreads(); - for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + i*OC); + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { - //printf("%d: %f + %f\n", oc, accumulators[k], (float)packed_dout[k]); accumulators[k] += (float)packed_dout[k]; } - //__syncthreads(); // keep block synchronised to maximise memory locality (?) 
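The comment at the top of matmul_backward_bias_kernel6 notes that accumulating into an fp32 scratch buffer is both faster for atomics and more accurate than accumulating directly in floatX. Below is a host-only toy, not taken from the patch, that emulates bf16 round-to-nearest in software to show the accuracy half of that claim; the 1e-3 "gradient" value and the iteration count are made up:

#include <cstdio>
#include <cstring>

// emulate rounding a float to bfloat16 precision (keep 8 mantissa bits, round to nearest even)
static float round_to_bf16(float x) {
    unsigned int u;
    memcpy(&u, &x, sizeof(u));
    u = (u + 0x7FFFu + ((u >> 16) & 1u)) & 0xFFFF0000u;
    float y;
    memcpy(&y, &u, sizeof(y));
    return y;
}

int main() {
    float acc_fp32 = 0.0f, acc_bf16 = 0.0f;
    for (int i = 0; i < 100000; i++) {
        const float g = 1e-3f;                    // a small per-(b,t) contribution to dbias
        acc_fp32 += g;                            // fp32 scratch: accumulates fine
        acc_bf16 = round_to_bf16(acc_bf16 + g);   // bf16 accumulator: stalls once half an ulp exceeds g
    }
    printf("fp32 accumulator: %f\n", acc_fp32);   // ~100.0
    printf("bf16 accumulator: %f\n", acc_bf16);   // stalls at 0.5: further additions round away
    return 0;
}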
} for (int k = 0; k < x128::size; k++) { atomicAdd(shared + local_oc + k, accumulators[k]); } __syncthreads(); if (threadIdx.y == 0) { - for (int i = threadIdx.x; i < OC_per_warp; i += block_size_x) { - //printf("%d => %f\n", i, shared[i]); - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + for (int idx = threadIdx.x; idx < OC_per_warp; idx += block_size_x) { + atomicAdd(dbias + idx + blockIdx.x*OC_per_warp, shared[idx]); } } } @@ -1140,7 +1041,6 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX float* dweight_shared = shared + C; // init shared memory to zero - #pragma unroll 4 for(int i = threadIdx.x; i < C; i+= blockDim.x){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; @@ -1167,12 +1067,11 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX dnorm_mean += dnorm_i; dnorm_norm_mean += dnorm_i * norm_bti; } - dnorm_mean = warpReduceSum(dnorm_mean); - dnorm_norm_mean = warpReduceSum(dnorm_norm_mean); - dnorm_mean = dnorm_mean / C; - dnorm_norm_mean = dnorm_norm_mean / C; + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; // now iterate again and accumulate all the gradients + // todo - use x128 for this loop to improve performance for (int i = warpThreadIdx; i < C; i += warpSize) { float dout_i = (float)__ldcs(&dout_bt[i]); float norm_bti = ((float)__ldcs(&inp_bt[i]) - mean_bt) * rstd_bt; @@ -1193,7 +1092,7 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX // Accumulate into a FP32 scratchpad // BF16 atomics are potentially much slower... and this is more precise! - // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though + // todo - could avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; @@ -1221,9 +1120,9 @@ __global__ void softmax_autoregressive_backward_kernel(floatX* dpreatt, const fl constexpr const int BlockSize = 256; constexpr int T_per_block = 4; - int idx = blockIdx.y; // go through blocks in reverse order, so the slowest block starts first int t0 = T - 1 - T_per_block*blockIdx.x; + int idx = blockIdx.y; att += idx * T * T; datt += idx * T * T; @@ -1254,41 +1153,40 @@ __global__ void softmax_autoregressive_backward_kernel(floatX* dpreatt, const fl // Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation). 
// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda -__device__ inline float lerp(float start, float end, float weight) { +__device__ float lerp(float start, float end, float weight) { return fma(weight, end, fma(-weight, start, start)); } -// Termplate type T instead of floatx template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, unsigned int seed) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_parameters) return; // guard + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } // guard // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[i]; - float m = m_memory[i]; - float v = v_memory[i]; + float grad = (float)grads_memory[idx]; + float m = m_memory[idx]; + float v = v_memory[idx]; // update the first moment (momentum) m = lerp(grad, m, beta1); - m_memory[i] = m; + m_memory[idx] = m; // update the second moment (RMSprop) v = lerp(grad * grad, v, beta2); - v_memory[i] = v; + v_memory[idx] = v; m /= beta1_correction; // m_hat v /= beta2_correction; // v_hat // fetch the old value of this parameter as a float, from either source - float old_param = (master_params_memory != NULL) ? master_params_memory[i] : (float)params_memory[i]; + float old_param = (master_params_memory != NULL) ? master_params_memory[idx] : (float)params_memory[idx]; // update this parameter float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param)); // update our low precision version of the parameters using stochastic rounding // this will be used in the next forward pass // TODO: simply doing `params_memory[i] = (floatX)param;` breaks everything (why?) unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - stochastic_rounding(param, ¶ms_memory[i], random); + stochastic_rounding(param, ¶ms_memory[idx], random); // write the full, float version of the param into our master copy, if we maintain one // this will be used in the next update - if (master_params_memory != NULL) { master_params_memory[i] = param; } + if (master_params_memory != NULL) { master_params_memory[idx] = param; } } struct SoftmaxParams { @@ -1378,14 +1276,14 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* __global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { // a small kernel to copy and cast, i.e. 
`dst <- (float) src` - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] = (float)src[i]; } + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = (float)src[idx]; } } __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] += (floatX)src[i]; } // have to += because dbias is a paramater + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] += (floatX)src[idx]; } // have to += because dbias is a paramater } // ---------------------------------------------------------------------------- @@ -1404,12 +1302,12 @@ void encoder_forward(floatX* out, void encoder_backward(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C) { + int B, int T, int C, uint seed) { NVTX_RANGE_FN(); const int N = B * T * C; const int block_size = 256; - const int grid_size = CEIL_DIV(N, block_size); - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C); + const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements + encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); cudaCheck(cudaGetLastError()); } @@ -1424,9 +1322,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, cudaCheck(cudaGetLastError()); } -// uses cuBLASLt to fuse the bias and gelu. does not work with OC = 50257 (last layer) // https://docs.nvidia.com/cuda/cublas/#cublasltmatmul -// https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuBLASLt/LtSgemm/sample_cublasLt_LtSgemm.cu void matmul_forward_cublaslt(floatX* out, floatX* inp, floatX* weight, floatX* bias, int B, int T, int C, int OC) { @@ -1439,13 +1335,8 @@ void matmul_forward_cublaslt(floatX* out, exit(EXIT_FAILURE); } - // FP16 alpha/beta need to be used if and only if CUBLAS_COMPUTE_16F + // these need to be in FP16 if and only if alpha/beta are CUBLAS_COMPUTE_16F const float alpha = 1.0f, beta = 0.0f; - const half alpha_fp16 = (half)alpha, beta_fp16 = (half)beta; - const void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&alpha_fp16 : (const void*)α - const void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&beta_fp16 : (const void*)β int returnedResults = 0; cublasLtMatmulDesc_t operationDesc; @@ -1459,16 +1350,12 @@ void matmul_forward_cublaslt(floatX* out, // create the operation descriptor cublasOperation_t opNoTranspose = CUBLAS_OP_N; cublasOperation_t opTranspose = CUBLAS_OP_T; - cublasLtEpilogue_t epilogueBias = CUBLASLT_EPILOGUE_BIAS; + cublasLtEpilogue_t epilogueBias = has_bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; - cudaDataType_t scale_type = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
CUDA_R_16F : CUDA_R_32F; - cublasCheck(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_LOWP_COMPUTE, scale_type)); + cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute, CUDA_R_32F)); // FP16 if CUBLAS_COMPUTE_16F cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opNoTranspose, sizeof(opNoTranspose))); - if(has_bias) { - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, - sizeof(epilogueBias))); - } + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, sizeof(epilogueBias))); cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); // define matrix layouts @@ -1480,8 +1367,7 @@ void matmul_forward_cublaslt(floatX* out, // create a preference handle with specified max workspace cublasCheck(cublasLtMatmulPreferenceCreate(&preference)); cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); // find a suitable algorithm cublasCheck(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc, @@ -1494,7 +1380,7 @@ void matmul_forward_cublaslt(floatX* out, // call the matmul cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, - alpha_ptr, weight, weightLayout, inp, inputLayout, beta_ptr, + &alpha, weight, weightLayout, inp, inputLayout, &beta, out, outputLayout, out, outputLayout, &heuristic.algo, cublaslt_workspace, cublaslt_workspace_size, main_stream)); @@ -1514,7 +1400,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // Note: `inp` is not needed for backward pass, so we re-use it as a scratch buffer. // Its contents will be overwritten by this function. const int block_size = 256; - const int softmax_block_size = 256; + const float alpha = 1.0f, beta = 0.0f; // inp is (B, T, 3C) QKV // preatt, att are (B, NH, T, T) @@ -1529,50 +1415,32 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, int total_threads = B * NH * T * HS; int num_blocks = CEIL_DIV(total_threads, block_size); permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); - cudaCheck(cudaGetLastError()); - // IMPORTANT: alpha/beta are FP32 for CUBLAS_COMPUTE_32F even if FP16 inputs/outputs - // But need FP16 scale for CUBLAS_COMPUTE_16F (no errors otherwise, just garbage results *sigh*) - const float alpha = 1.0f; - const float beta = 0.0f; - const floatX alpha_lowp = (floatX)alpha; - const floatX beta_lowp = (floatX)beta; - void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? (void*)&alpha_lowp : (void*)α - void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
(void*)&beta_lowp : (void*)β floatX* preatt = inp; cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, - T, T, HS, - alpha_ptr, + T, T, HS, &alpha, k, CUBLAS_LOWP, HS, T * HS, q, CUBLAS_LOWP, HS, T * HS, - beta_ptr, - preatt, CUBLAS_LOWP, T, T * T, - B * NH, - CUBLAS_LOWP_COMPUTE, - CUBLAS_GEMM_DEFAULT)); + &beta, preatt, CUBLAS_LOWP, T, T * T, + B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); - int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); - softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); - cudaCheck(cudaGetLastError()); + int grid_size = CEIL_DIV(B * NH * T * 32, block_size); + softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); // new approach: first cuBLAS another batched matmul floatX* vaccum = inp; // y = att @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs) cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, - HS, T, T, - alpha_ptr, + HS, T, T, &alpha, v, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, - beta_ptr, - vaccum, CUBLAS_LOWP, HS, T * HS, - B * NH, - CUBLAS_LOWP_COMPUTE, - CUBLAS_GEMM_DEFAULT)); + &beta, vaccum, CUBLAS_LOWP, HS, T * HS, + B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // now unpermute // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side @@ -1610,8 +1478,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* dbias_buffer, int B, int T, int C, int OC) { NVTX_RANGE_FN(); - float one = 1.0f; - float zero = 0.0f; + float one = 1.0f, zero = 0.0f; // backward to bias, if given, does a += if (dbias != NULL) { @@ -1620,14 +1487,15 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // blockDim.x is 32 --> single warp being responsible for those 256 OCs // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * cuda_threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! const int warp_size = 32; const int block_size = 512; const int OC_per_warp = warp_size * x128::size; // 256 at BF16 const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 const int grid_size_x = OC / OC_per_warp; // e.g. 3 horizontal blocks for 768 OCs at BF16 - const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount + / (block_size * grid_size_x)); // full GPU! 
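To make the launch-dimension comments above concrete, here is a throwaway calculation (not part of the patch) with assumed values: OC = 768 as in the code's own "3 horizontal blocks for 768 OCs" example, x128::size = 8 for bf16, and an A100-like GPU with 108 SMs and 2048 resident threads per SM:

#include <algorithm>
#include <cstdio>

int main() {
    const int OC = 768;                              // assumed output channels
    const int sm_count = 108, threads_per_sm = 2048; // assumed A100-like occupancy figures
    const int warp_size = 32, block_size = 512;
    const int x128_size = 8;                         // 16-byte loads / 2-byte bf16 elements
    const int OC_per_warp = warp_size * x128_size;   // 256 output channels per warp
    const int grid_size_x = OC / OC_per_warp;        // 3 blocks cover all 768 OCs
    const int grid_size_y = std::max(1, sm_count * threads_per_sm / (block_size * grid_size_x)); // 144
    printf("grid = (%d, %d), block = (%d, %d)\n",
           grid_size_x, grid_size_y, warp_size, block_size / warp_size);
    return 0;
}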
assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance @@ -1636,17 +1504,17 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, dim3(block_size_x, block_size_y), OC_per_warp * sizeof(float), main_stream>>>(dbias_buffer, dout, B, T, OC); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); - cudaCheck(cudaGetLastError()); } // backward to input, uses = in the backward pass (set the gradient) cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, C, B*T, OC, &one, weight, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &zero, - dinp, CUBLAS_LOWP, C, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // backward to weight, uses += in the backward pass (accumulate the gradient) + dinp, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // backward to weight, uses += in the backward pass (accumulate the gradient) by setting alpha=one cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, C, OC, B*T, &one, inp, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &one, - dweight, CUBLAS_LOWP, C, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + dweight, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + cudaCheck(cudaGetLastError()); } void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -1654,11 +1522,10 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 1024; - const int grid_size = 1 * cuda_num_SMs; + const int grid_size = deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); - layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1672,14 +1539,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da NVTX_RANGE_FN(); const int block_size = 256; int HS = C / NH; // head size - - // FP16 alpha/beta need to be used if and only if CUBLAS_COMPUTE_16F const float alpha = 1.0f, beta = 0.0f; - const half alpha_fp16 = (half)alpha, beta_fp16 = (half)beta; - const void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&alpha_fp16 : (const void*)α - const void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
- (const void*)&beta_fp16 : (const void*)β // unpack convenience pointers into q, k, v const floatX *q, *k, *v; @@ -1694,31 +1554,26 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward through the unpermute operation int num_blocks = CEIL_DIV(B * T * C, block_size); unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); - cudaCheck(cudaGetLastError()); // backward into datt - - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, alpha_ptr, - v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, beta_ptr, - datt, CUBLAS_LOWP, T, T * T, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); - + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, + v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, &beta, + datt, CUBLAS_LOWP, T, T * T, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into dv - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, alpha_ptr, - scratch, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, beta_ptr, - dv, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); - + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, + scratch, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, &beta, + dv, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into preatt int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); - cudaCheck(cudaGetLastError()); // backward into q - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, alpha_ptr, - k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, beta_ptr, - dq, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &alpha, + k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, + dq, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into k - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, alpha_ptr, - q, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, beta_ptr, - dk, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, + q, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, + dk, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into inp num_blocks = CEIL_DIV(B * NH * T * HS, block_size); permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); @@ -1814,9 +1669,9 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen cudaCheck(cudaMalloc((void**)¶ms_memory, num_parameters_bytes)); // assign all the tensors their place in the array floatX** ptrs[] = { - ¶ms->wte, ¶ms->wpe, (floatX**)¶ms->ln1w, (floatX**)¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, - ¶ms->attprojw, ¶ms->attprojb, (floatX**)¶ms->ln2w, (floatX**)¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, - ¶ms->fcprojw, ¶ms->fcprojb, (floatX**)¶ms->lnfw, (floatX**)¶ms->lnfb + ¶ms->wte, ¶ms->wpe, ¶ms->ln1w, ¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, + ¶ms->attprojw, ¶ms->attprojb, ¶ms->ln2w, ¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, + ¶ms->fcprojw, ¶ms->fcprojb, ¶ms->lnfw, ¶ms->lnfb }; char* 
params_memory_iterator = (char*)params_memory; for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { @@ -1826,7 +1681,7 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen return params_memory; } -#define NUM_ACTIVATION_TENSORS 21 +#define NUM_ACTIVATION_TENSORS 20 typedef struct { floatX* encoded; // (B, T, C) floatX* ln1; // (L, B, T, C) @@ -1846,7 +1701,6 @@ typedef struct { floatX* lnf; // (B, T, C) floatX* lnf_mean; // (B, T) floatX* lnf_rstd; // (B, T) - floatX* losses; // (B, T) // todo - no longer used as GPU writes directly to cpu_losses // adding these two compared to the CPU .c code, needed for attention kernel as buffers floatX* qkvr; // (L, B, T, 3*C) // in inference mode, this buffer will store the logits @@ -1885,9 +1739,8 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[15] = B * T * C; // lnf act_sizes[16] = B * T; // lnf_mean act_sizes[17] = B * T; // lnf_rstd - act_sizes[18] = B * T; // losses - act_sizes[19] = L * B * T * 3*C; // qkvr - act_sizes[20] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch + act_sizes[18] = L * B * T * 3*C; // qkvr + act_sizes[19] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch } // Backward pass is conceptually quite different from forward, because we can discard @@ -1938,7 +1791,7 @@ void* malloc_and_point_activations(ActivationTensors* acts, const size_t* act_si &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->atty, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->losses, &acts->qkvr, &acts->output + &acts->lnf_mean, &acts->lnf_rstd, &acts->qkvr, &acts->output }; return malloc_and_point(ptrs, act_sizes, NUM_ACTIVATION_TENSORS); } @@ -2358,7 +2211,7 @@ void gpt2_backward(GPT2 *model) { // layernorm backward does += to dresidual, so it correctly accumulates gradient for the Attention block above layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } - encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C); + encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); // accumulate the loss, this was calculated at the end of gpt2_forward() cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago @@ -2441,9 +2294,61 @@ void gpt2_free(GPT2 *model) { cudaFreeHost(model->cpu_losses); } +// ---------------------------------------------------------------------------- +// common init & free code for train/test/profile +void common_start(bool override_enable_tf32 = true) { + int deviceIdx = 0; + cudaCheck(cudaSetDevice(deviceIdx)); + cudaGetDeviceProperties(&deviceProp, deviceIdx); + printf("[System]\n"); + printf("Device %d: %s\n", deviceIdx, deviceProp.name); + + cudaCheck(cudaStreamCreate(&main_stream)); + cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); + for (int i = 0; i < num_parallel_streams; i++) { + cudaCheck(cudaStreamCreate(¶llel_streams[i])); + cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); + } + + // set up cuBLAS and cuBLASLt (and cuDNN if enabled) + cublasCheck(cublasCreate(&cublas_handle)); + cublasCheck(cublasSetStream(cublas_handle, main_stream)); + cublasCheck(cublasLtCreate(&cublaslt_handle)); + 
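For scale, a quick standalone evaluation (not part of the patch) of the shared output/scratch activation size above, act_sizes[19] = B * T * max(3C, max(NH*T, Vp)), using assumed GPT-2 124M shapes (B = 4, T = 1024, C = 768, NH = 12, Vp = 50304); the padded-vocab term dominates:

#include <algorithm>
#include <cstdio>

int main() {
    const size_t B = 4, T = 1024, C = 768, NH = 12, Vp = 50304;     // assumed shapes
    const size_t per_token = std::max(3 * C, std::max(NH * T, Vp)); // 2304 vs 12288 vs 50304
    const size_t elems = B * T * per_token;
    printf("output/scratch elements: %zu (%.1f MiB at 2 bytes/element)\n",
           elems, (double)(elems * 2) / (1024.0 * 1024.0));
    return 0;
}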
cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); + #ifdef ENABLE_CUDNN + checkCudnnErr(cudnnCreate(&cudnn_handle)); + #endif + // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') + bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; + cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); + cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; + + // setup the (global) cuBLASLt workspace + cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); +} + +void common_free(GPT2 &model) { + cudaCheck(cudaEventDestroy(main_event)); + cudaCheck(cudaEventDestroy(loss_event)); + for (int i = 0; i < num_parallel_streams; i++) { + cudaCheck(cudaStreamDestroy(parallel_streams[i])); + cudaCheck(cudaEventDestroy(parallel_events[i])); + } + cudaCheck(cudaStreamDestroy(main_stream)); + + gpt2_free(&model); + #ifdef ENABLE_CUDNN + if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } + checkCudnnErr(cudnnDestroy(cudnn_handle)); + #endif + cudaCheck(cudaFree(cublaslt_workspace)); + cublasCheck(cublasDestroy(cublas_handle)); + cublasCheck(cublasLtDestroy(cublaslt_handle)); +} + #ifndef TESTING // if we are TESTING (see test_gpt2.cu), we'll skip the int main below - // ---------------------------------------------------------------------------- // data loader lite: returns random batches of data from a file of integers @@ -2656,45 +2561,14 @@ int main(int argc, char *argv[]) { printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); - // set up the device - cudaCheck(cudaSetDevice(multi_gpu_config.local_device_idx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx); - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; + common_start(override_enable_tf32); // common init code for train/test/profile - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // set up cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - // setup compute precision settings for cublas - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = cuda_arch_major >= 8 ? 1 : 0; - if (override_enable_tf32 == 0) { enable_tf32 = 0; } // force to zero via arg - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - if(cublas_compute_type); // unused in BF16 mode, avoid warning - - // set up cuDNN - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + const char* precision_str = (PRECISION_MODE == PRECISION_FP32) + ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? "TF32" : "FP32") + : (PRECISION_MODE == PRECISION_FP16 ? "FP16" : "BF16"); printf0("| device | %-50s |\n", deviceProp.name); - printf0("| TF32 | %-50s |\n", enable_tf32 ? "enabled" : "disabled"); - printf0("| precision | %-50s |\n", precision_mode_str); + printf0("| precision | %-50s |\n", precision_str); printf0("+-----------------------+----------------------------------------------------+\n"); // build the GPT-2 model from a checkpoint @@ -2712,17 +2586,15 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128]; - char val_tokens_filename[128]; + char train_tokens_filename[128], val_tokens_filename[128]; assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow // if we're only overfitting a single batch for debugging, let's overfit the first batch // from val instead of train split, because val is smaller and a bit faster const char* train_split = (overfit_single_batch == 1) ? "val" : "train"; sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); - DataLoader train_loader; + DataLoader train_loader, val_loader; dataloader_init(&train_loader, &multi_gpu_config, train_tokens_filename, B, T); - DataLoader val_loader; dataloader_init(&val_loader, &multi_gpu_config, val_tokens_filename, B, T); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? 
train_loader.num_batches : val_max_batches; @@ -2738,11 +2610,9 @@ int main(int argc, char *argv[]) { printf0("num_parameters: %zu ==> bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024))); - // set up the Logger + // set up the Logger & Tokenizer Logger logger; logger_init(&logger, output_log_file); - - // build the Tokenizer Tokenizer tokenizer; tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); @@ -2870,20 +2740,13 @@ int main(int argc, char *argv[]) { dataloader_free(&train_loader); dataloader_free(&val_loader); tokenizer_free(&tokenizer); - gpt2_free(&model); free(cpu_logits_raw); free(cpu_logits); free(gen_tokens); - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); logger_free(&logger); multi_gpu_config_free(&multi_gpu_config); + common_free(model); return 0; } #endif From abaaceb8011feb95ac402c0b5d152d3d65918f9a Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:43:13 +0100 Subject: [PATCH 010/172] Added makefile gencode changes --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 06923801d..a755c309f 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,9 @@ CUDA_OUTPUT_FILE = -o $@ # NVCC flags # -t=0 is short for --threads, 0 = number of CPUs on the machine -NVCC_FLAGS = -O3 -t=0 --use_fast_math +# include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU +# this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work +NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=compute_80 -arch=native NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = From 18d7ed92429756427b2af4483cc8f2203a4ebe35 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:46:39 +0100 Subject: [PATCH 011/172] revert profile.py changes for now --- profile_gpt2cu.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 8e15b7dc2..b3eec863a 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -31,7 +31,7 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # todo - tensor core % + "smsp__inst_executed.sum", # instructions ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -55,11 +55,11 @@ for rid, row in enumerate(reader): if rid == 0: # headings - print(f"id pass {'name':<70} {'time':>8} {'RAM BW':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<70} {'ms':>8} {'GB/s':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" 
print(units) print("." * len(units)) continue @@ -74,7 +74,7 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) + inst = float(row[16]) / 1e6 kid = rid - 2 @@ -118,21 +118,18 @@ total['l2_write'] += l2_write total['inst'] += inst - dram_bw = (read + write) / (time / 1000.0); - - print(f"{kid:02} {pass_name:4} {fn_name:<70} {time:8.2f} {dram_bw:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + print(f"{kid:02} {pass_name:4} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") total_time = total['time'] -total_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0); print("." * len(units)) -print(f" {'Total':<70} {total['time']:8.2f} {total_dram_bw:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") -print(f" {'name':<70} {'time':>6} {'frac':>6}") +print(f" {'name':<40} {'time':>6} {'frac':>6}") ordered = sorted(summaries.items(), key=lambda x: x[1], reverse=True) for entry, value in ordered: - print(f" {entry:<70} {value:6.2f} {100*value / total_time:6.2f}%") + print(f" {entry:<40} {value:6.2f} {100*value / total_time:6.2f}%") ts = total_time / 1000 From ec0ab2d2d0eb5b7be3a7fa4bb93953cf57a9e364 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 00:00:58 +0100 Subject: [PATCH 012/172] Remove arch=native as it only available on recent CUDA versions --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a755c309f..05a02c387 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CUDA_OUTPUT_FILE = -o $@ # -t=0 is short for --threads, 0 = number of CPUs on the machine # include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU # this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work -NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=compute_80 -arch=native +NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=sm_80 NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = From 83ec4b8e701be6488ae632184731bd2f5e8af8bc Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:09:52 +0100 Subject: [PATCH 013/172] Slightly reduce lines of code in cudnn_att --- cudnn_att.cu | 164 +++++++++++++++++++++------------------------------ 1 file changed, 68 insertions(+), 96 deletions(-) diff --git a/cudnn_att.cu b/cudnn_att.cu index 2735bbd14..fdff63483 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -5,18 +5,22 @@ #include #include #include +namespace fe = cudnn_frontend; // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; - +static_assert(false, "cuDNN is not supported in FP32 mode.") // use fp16 (note: this may require gradient scaler, currently not implemented!) 
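The fp16 comment above notes that a gradient scaler may be required but is not implemented. Purely as an illustration of that general technique (this is not code from this repository, and the function name is invented), the usual dynamic loss-scaling recipe is: run backward on loss * scale, then unscale the gradients and skip the optimizer step whenever an overflow is detected:

#include <cmath>
#include <cstddef>

// returns true if the optimizer step should proceed; assumes backward was run on (loss * scale)
bool unscale_grads_and_check(float* grads, size_t n, float* scale) {
    for (size_t i = 0; i < n; i++) {
        const float g = grads[i] / *scale;
        if (!std::isfinite(g)) {   // inf/nan: the current scale was too aggressive
            *scale *= 0.5f;        // back off and skip this step entirely
            return false;
        }
        grads[i] = g;
    }
    // after enough consecutive good steps a real implementation would grow the scale again,
    // typically by doubling it; omitted here to keep the sketch short
    return true;
}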
+ #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F - +#define CUDNN_16BIT fe::DataType_t::HALF #else // Default to bfloat16 + typedef __nv_bfloat16 floatX; +#define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif // CUDA error checking @@ -30,28 +34,17 @@ static void cudaCheck(cudaError_t error, const char *file, int line) { #define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) // Profiler utils -namespace { - class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - - ~NvtxRange() { nvtxRangePop(); } - }; -} +class NvtxRange { + public: + NvtxRange(const char* s) { nvtxRangePush(s); } + NvtxRange(const std::string& base_str, int number) { + std::string range_string = base_str + " " + std::to_string(number); + nvtxRangePush(range_string.c_str()); + } + ~NvtxRange() { nvtxRangePop(); } +}; #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) -namespace fe = cudnn_frontend; -#if CUBLAS_LOWP == CUDA_R_16BF -#define CUDNN_16BIT fe::DataType_t::BFLOAT16 -#else -#define CUDNN_16BIT fe::DataType_t::HALF -#endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) static void* cudnn_workspace = NULL; @@ -99,28 +92,24 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); @@ -136,8 +125,8 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) - .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_dim({B, H, T, 1}) + .set_stride({H * T, T, 1, 1}); } checkCudnnFE(graph->validate()); @@ -152,6 +141,7 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); user_maintained_cache_fwd.insert({key, tuple}); @@ -165,47 +155,39 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") - .set_dim({B, NH, T, 1}) - .set_stride({NH * T, T, 1, 1}) - .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") - .set_causal_mask(true) - .set_attn_scale(attn_scale); + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") + .set_dim({B, NH, T, HS}) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") + .set_dim({B, NH, T, HS}) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") + .set_dim({B, NH, T, 1}) + .set_stride({NH * T, T, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") + .set_causal_mask(true) + .set_attn_scale(attn_scale); // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = 
graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); @@ -227,6 +209,16 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); user_maintained_cache_bwd.insert({key, tuple}); return tuple; @@ -260,16 +252,6 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) variant_pack[softmax_stats] = stats; } - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); cudaCheck(cudaGetLastError()); @@ -304,16 +286,6 @@ void attention_backward_cudnn(floatX* dqkvr, {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, {attn_scale, &attn_scale_cpu}}; - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); cudaCheck(cudaGetLastError()); From 8675104b73a04b8fcaec393a5e5be357713c5b45 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:24:31 +0100 Subject: [PATCH 014/172] Compile for the user's GPU architecture using nvidia-smi query on Linux --- Makefile | 11 ++++++++--- cudnn_att.cu | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 97e67b97c..efdf4ddf3 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,7 @@ CUDA_OUTPUT_FILE = -o $@ # NVCC flags # -t=0 is short for --threads, 0 = number of CPUs on the machine -# include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU -# this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work -NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=sm_80 +NVCC_FLAGS = -O3 -t=0 --use_fast_math NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = @@ -24,6 +22,13 @@ NVCC_CUDNN = # overridable flag for multi-GPU training. 
by default we won't build with cudnn # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# on linux, try to use nvidia-smi to detect the user's GPU and compile for that specific architecture +ifeq ($(SHELL_UNAME), Linux) + NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') + ifdef NVCC_ARCH + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + endif +endif # autodect a lot of various supports on current platform $(info ---------------------------------------------) diff --git a/cudnn_att.cu b/cudnn_att.cu index fdff63483..398e4e4c3 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -11,14 +11,14 @@ namespace fe = cudnn_frontend; #if defined(ENABLE_FP32) typedef float floatX; static_assert(false, "cuDNN is not supported in FP32 mode.") -// use fp16 (note: this may require gradient scaler, currently not implemented!) +// use fp16 (note: this may require gradient scaler, currently not implemented!) #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define CUDNN_16BIT fe::DataType_t::HALF -#else // Default to bfloat16 +#else // Default to bfloat16 typedef __nv_bfloat16 floatX; #define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif From 7789738879c214e51227ad633ecaddb7b2405d8f Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:44:24 +0100 Subject: [PATCH 015/172] Add PTX back to binary + fix whitespaces --- Makefile | 2 +- cudnn_att.cu | 8 ++++---- train_gpt2.cu | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index efdf4ddf3..583f86366 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ USE_CUDNN ?= 0 ifeq ($(SHELL_UNAME), Linux) NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) endif endif diff --git a/cudnn_att.cu b/cudnn_att.cu index 398e4e4c3..4664d1827 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -106,10 +106,10 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); diff --git a/train_gpt2.cu b/train_gpt2.cu index bc07ca1ff..dbc25677b 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2094,14 +2094,14 @@ void common_start(bool override_enable_tf32 = true) { cublasCheck(cublasSetStream(cublas_handle, main_stream)); cublasCheck(cublasLtCreate(&cublaslt_handle)); cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - + // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; // setup the (global) cuBLASLt workspace cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - + create_cudnn(); } From c15ca1f4cf1508421004020a694199b3eecaa891 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:49:16 +0100 Subject: [PATCH 016/172] Fix makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 583f86366..219748185 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ USE_CUDNN ?= 0 ifeq ($(SHELL_UNAME), Linux) NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH) -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) endif endif From 9663719c628193e3a61403c6c2e49e7c335628b4 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 4 May 2024 23:24:41 -0700 Subject: [PATCH 017/172] Minor fixes for Make for cudnn and windows support Tested on Ubuntu Linux 22.04 and Windows 11 --- Makefile | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 04cbfbb2a..f67ba3f72 100644 --- a/Makefile +++ b/Makefile @@ -67,16 +67,16 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line # You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions +# Follow the apt-get instructions or Windows instructions to install the cuDNN library # And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory +# For this there is no installation, just download the repo to your home directory or directory of your choice # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) @@ -87,7 +87,19 @@ ifeq ($(USE_CUDNN), 1) $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) endif else - $(info → cuDNN is not supported right now outside of Linux) + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + endif endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -191,28 +203,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cu + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) From 2d4e5fd840e9c456c459a5890b40c9bb1746b737 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 4 May 2024 23:24:41 -0700 Subject: [PATCH 018/172] Minor fixes for Makefile for cudnn and windows support Tested on Ubuntu Linux 22.04 and Windows 11 --- Makefile | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 04cbfbb2a..f67ba3f72 100644 --- a/Makefile +++ b/Makefile @@ -67,16 +67,16 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer +# You can override the path to cudnn frontend by setting 
CUDNN_FRONTEND_PATH=your_path on the make command line # You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions +# Follow the apt-get instructions or Windows instructions to install the cuDNN library # And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory +# For this there is no installation, just download the repo to your home directory or directory of your choice # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) @@ -87,7 +87,19 @@ ifeq ($(USE_CUDNN), 1) $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) endif else - $(info → cuDNN is not supported right now outside of Linux) + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + endif endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -191,28 +203,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cu + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) 
$(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) From 9910a4086394ba5498e4d0191a0f058720324c47 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 14:42:32 +0100 Subject: [PATCH 019/172] Removed makefile change so we can integrate #339 instead which feels slightly cleaner --- Makefile | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Makefile b/Makefile index 219748185..04cbfbb2a 100644 --- a/Makefile +++ b/Makefile @@ -22,13 +22,6 @@ NVCC_CUDNN = # overridable flag for multi-GPU training. by default we won't build with cudnn # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 -# on linux, try to use nvidia-smi to detect the user's GPU and compile for that specific architecture -ifeq ($(SHELL_UNAME), Linux) - NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') - ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH) -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) - endif -endif # autodect a lot of various supports on current platform $(info ---------------------------------------------) From 876ab93c0a9df1c1b3765f3df3a927364685830b Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 15:05:59 +0100 Subject: [PATCH 020/172] Add FP16 path for atomicStochasticAdd (+remove __bfloat1622float2 to work on older CUDAs) + fixes --- cudnn_att.cu | 20 +++++++++++--------- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 26 +++++++++++++++----------- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/cudnn_att.cu b/cudnn_att.cu index 4664d1827..fd9760b1a 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -34,15 +34,17 @@ static void cudaCheck(cudaError_t error, const char *file, int line) { #define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) // Profiler utils -class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - ~NvtxRange() { nvtxRangePop(); } -}; +namespace { + class NvtxRange { + public: + NvtxRange(const char* s) { nvtxRangePush(s); } + NvtxRange(const std::string& base_str, int number) { + std::string range_string = base_str + " " + std::to_string(number); + nvtxRangePush(range_string.c_str()); + } + ~NvtxRange() { nvtxRangePop(); } + }; +} #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) static cudnnHandle_t cudnn_handle; diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 8c12628e5..c29cd6a08 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -28,7 +28,7 @@ the profile.ncu-rep from a cloud box to local to pretty view. 
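For context on the NvtxRange helper that patch 020 wraps in an anonymous namespace above (presumably to keep the definition local to cudnn_att.cu, since train_gpt2.cu defines the same class): it is an RAII wrapper around nvtxRangePush/nvtxRangePop, so a profiling range is tied to a scope and cannot be left unbalanced. A hedged usage sketch follows; attention_forward_example is a made-up caller, not a function from the repo, and it assumes the NvtxRange class and NVTX_RANGE_FN macro as defined in the patch above.

// toy caller showing how the RAII ranges nest in an Nsight timeline
void attention_forward_example(float* out, const float* inp, int B, int T, int C) {
    NVTX_RANGE_FN();                        // pushes a range named after this function
    for (int l = 0; l < 12; l++) {
        NvtxRange layer_range("layer", l);  // nested range labelled "layer <l>"
        // ... kernel launches for this layer show up under the nested range ...
    }                                       // per-layer range popped at the end of each iteration
}                                           // function-level range popped on return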
#include "train_gpt2.cu" int main() { - common_start(); + common_start(true, true); // build the GPT-2 model from a checkpoint GPT2 model; diff --git a/test_gpt2.cu b/test_gpt2.cu index 3fc6b6f0e..d7944125c 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,7 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { - common_start(false); + common_start(false, true); // build the GPT-2 model from a checkpoint GPT2 model; diff --git a/train_gpt2.cu b/train_gpt2.cu index dbc25677b..1a1a53752 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -473,19 +473,21 @@ __global__ void encoder_forward_kernel3(floatX* out, store128(out_btc, packed_out); } -__device__ void atomicStochasticAdd(__nv_bfloat16* address, float val0, float val1, uint seed) { +template +__device__ void atomicStochasticAdd(T* address, float val0, float val1, uint seed) { + static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); float2 val = make_float2(val0, val1); uint* address_as_uint = (uint*)address; uint old = *address_as_uint, assumed; uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); do { assumed = old; - float2 old_fp32 = __bfloat1622float2(*(__nv_bfloat162*)&old); - float2 new_fp32 = make_float2(old_fp32.x + val.x, old_fp32.y + val.y); - __nv_bfloat162 new_bf16; - stochastic_rounding(new_fp32.x, &new_bf16.x, random); - stochastic_rounding(new_fp32.y, &new_bf16.y, random >> 16); - old = atomicCAS(address_as_uint, assumed, *(uint*)&new_bf16); + float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, + (float)(reinterpret_cast(&old)[1]) + val.y); + T new_rounded[2]; + stochastic_rounding(new_fp32.x, &new_rounded[0], random); + stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); + old = atomicCAS(address_as_uint, assumed, *(uint*)&new_rounded); } while (assumed != old); } __device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { @@ -2074,12 +2076,14 @@ void gpt2_free(GPT2 *model) { // ---------------------------------------------------------------------------- // common init & free code for train/test/profile -void common_start(bool override_enable_tf32 = true) { +void common_start(bool override_enable_tf32 = true, bool print_device_info = true) { int deviceIdx = 0; cudaCheck(cudaSetDevice(deviceIdx)); cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); + if (print_device_info) { + printf("[System]\n"); + printf("Device %d: %s\n", deviceIdx, deviceProp.name); + } cudaCheck(cudaStreamCreate(&main_stream)); cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); @@ -2335,7 +2339,7 @@ int main(int argc, char *argv[]) { printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); - common_start(override_enable_tf32); // common init code for train/test/profile + common_start(override_enable_tf32, false); // common init code for train/test/profile const char* precision_str = (PRECISION_MODE == PRECISION_FP32) ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? 
"TF32" : "FP32") From 804a9af1dc2cc4a387dacfa28aba658ce14262a6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 5 May 2024 23:43:02 +0300 Subject: [PATCH 021/172] make things compile with nvcc11 --- train_gpt2.cu | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1a1a53752..86ceb5caa 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -167,6 +167,21 @@ void mpi_check(int status, const char *file, int line) { #define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) #endif +// older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being unsigned shorts. +// we need to be careful here to only define our own versions if none already exist, otherwise the compiler will +// complain. +// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) +#if defined(ENABLE_BF16) and __CUDACC_VER_MAJOR__ < 12 and not(__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +__device__ floatX __ldcs(const floatX* address) { + unsigned short bf = __ldcs(reinterpret_cast(address)); + return __nv_bfloat16_raw{bf}; +} + +__device__ void __stcs(floatX* address, floatX value) { + __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); +} +#endif + // warp-level reduction for summing values __device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -1056,7 +1071,7 @@ __global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { dst[idx] += (floatX)src[idx]; } // have to += because dbias is a paramater + if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater } // ---------------------------------------------------------------------------- From bbfe8c989c06c145f33c6b3890cdc688dc2dca2c Mon Sep 17 00:00:00 2001 From: lancer Date: Sun, 5 May 2024 16:59:58 -0700 Subject: [PATCH 022/172] Minor update on the code --- dev/cuda/classifier_fused.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index df6894113..9202c2cee 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -5,7 +5,7 @@ much of a restriction: In pretraining, it is just a constant 1/batch_size tensor out the input prompt, but that is known in advance. 
Compile example: -nvcc -O3 --use_fast_math classifier_fused.cu -o classifier_fused +nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_fused ./classifier_fused 1 ./classifier_fused 2 @@ -198,7 +198,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(cg::thread_block_tile<32>& wa float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+3)/4 + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + for (int i = ceil_div(V, 4) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { float4 v4 = x_vec4[i]; #pragma unroll for(int k = 0; k < 4; k++) { @@ -207,7 +207,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(cg::thread_block_tile<32>& wa } float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, vec_at(v4, k)); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(vec_at(v4, k) - thread_maxval); } } @@ -270,7 +270,7 @@ __global__ void fused_classifier_kernel2(float* dlogits, float* losses, float* p // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const float4* logits_vec4 = reinterpret_cast(logits + idx * P); - for (int i = threadIdx.x; i < (V+3)/4; i += blockDim.x) { + for (int i = threadIdx.x; i < ceil_div(V, 4); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache persistence float4 v4 = __ldcs(&logits_vec4[i]); @@ -307,7 +307,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide_nofloat4(cg::thread_block_til float v = x[i]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(v - thread_maxval); } @@ -390,16 +390,16 @@ __device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, i float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+3)/4 + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + for (int i = ceil_div(V, f128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { f128 packed_x = load128cs(x + i * f128::size); // load and do not keep in cache for(int k = 0; k < packed_x.size; ++k) { - if (i*4+k >= V) { // bounds checking against real V + if (i*f128::size+k >= V) { // bounds checking against real V continue; } float v = (float)packed_x[k]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(v - thread_maxval); } } @@ -457,7 +457,7 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const float* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < (V+f128::size-1)/f128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < ceil_div(V , f128::size); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache 
persistence f128 packed_logits_vec = load128cs(logits_vec + i * f128::size); // load and do not keep in cache From 2bfd2b6a7e3881d65414b50fdd87d37656b12f91 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 00:46:28 -0700 Subject: [PATCH 023/172] Auto-detect GPU capability 3 cases tested on Windows 11 and Ubuntu 22.04 --- Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Makefile b/Makefile index 04cbfbb2a..3ec092466 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,23 @@ NVCC_CUDNN = # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# Function to check if a file exists in the PATH +define file_exists_in_path + $(shell where $(1) 2>nul || which $(1) 2>/dev/null) +endef + +ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif +endif + +# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above +ifneq ($(GPU_COMPUTE_CAPABILITY),) + NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + # autodect a lot of various supports on current platform $(info ---------------------------------------------) From 0c4908d8153e051d5d33deff6a3d799da973041d Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 01:11:27 -0700 Subject: [PATCH 024/172] Adding CI check to disable auto-detect --- Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 3ec092466..114568910 100644 --- a/Makefile +++ b/Makefile @@ -28,10 +28,12 @@ define file_exists_in_path $(shell where $(1) 2>nul || which $(1) 2>/dev/null) endef -ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= - ifneq ($(call file_exists_in_path, __nvcc_device_query),) - GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) - GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif endif endif From 7a8f471bc77f90ac8385ad968721998cddb7ecbb Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 6 May 2024 17:49:58 +0800 Subject: [PATCH 025/172] further reorganization --- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 74 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 97ea3ce09..c29cd6a08 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -53,7 +53,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, model.num_parameters, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free diff --git a/test_gpt2.cu b/test_gpt2.cu index dfe5486cc..d7944125c 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -255,7 +255,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], 
tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, model.num_parameters, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index 0c28eff4f..adcd2812f 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2088,6 +2088,7 @@ void gpt2_backward(GPT2 *model) { // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU + if (multi_gpu_config->num_processes == 1) return value; // MPI doesn't support all reduce with mean, so we sum up, then divide. float result; mpiCheck(MPI_Allreduce(&value, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD)); @@ -2104,11 +2105,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); #ifdef MULTI_GPU - // all gather is only required when num_processes > 1 - if (multi_gpu_config->num_processes == 1) { - return; - } - + if (multi_gpu_config->num_processes == 1) return; // Average all gradients. ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, @@ -2119,18 +2116,18 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, size_t shard_num_parameters, size_t shard_offset) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { NVTX_RANGE_FN(); // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - // lazily allocate the memory for m_memory and v_memory according to shard configs + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, shard_num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); @@ -2140,19 +2137,49 @@ void gpt2_update(GPT2 *model, float learning_rate, float 
beta1, float beta2, flo } } - floatX* params_memory = (floatX*)model->params_memory + shard_offset; - floatX* grads_memory = (floatX*)model->grads_memory + shard_offset; + int block_size = 512; + int num_blocks = CEIL_DIV(model->num_parameters, block_size); + float beta1_correction = 1.0f - powf(beta1, t); + float beta2_correction = 1.0f - powf(beta2, t); + unsigned int seed = random_u32(&model->rng_state); + adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, + (floatX*)model->grads_memory, model->m_memory, model->v_memory, + model->num_parameters, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); + cudaCheck(cudaGetLastError()); +} + +void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { + NVTX_RANGE_FN(); + if (model->m_memory == NULL) { + cudaCheck(cudaMalloc((void**)&model->m_memory, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, multi_gpu_config->shard_num_parameters* sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + if (model->use_master_weights == 1) { + cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); + copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + cudaCheck(cudaGetLastError()); + printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); + } + } + + floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; float* master_weights = NULL; if (model->use_master_weights == 1) { - master_weights = model->master_weights + shard_offset; + master_weights = model->master_weights + multi_gpu_config->shard_offset; } int block_size = 512; - int num_blocks = CEIL_DIV(shard_num_parameters, block_size); + int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, shard_num_parameters, + adamw_kernel3<<>>(params_memory, master_weights, grads_memory, + model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); } @@ -2160,10 +2187,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - // all gather is only required when num_processes > 1 - if (multi_gpu_config->num_processes == 1) { - return; - } + if (multi_gpu_config->num_processes == 1) return; if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each 
process @@ -2616,9 +2640,13 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, false); gpt2_zero_grad(&model); gpt2_backward(&model); +#ifndef MULTI_GPU + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); +#else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, multi_gpu_config.shard_num_parameters, multi_gpu_config.shard_offset); + gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); +#endif // todo - move or double-buffer all of this timing logic to avoid idling the GPU at this point! cudaEventRecord(end); From 134f4c7fc8e9092f09d30ad15efa979600e4e656 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 12:38:40 -0700 Subject: [PATCH 026/172] Adding two directory search for cuDNN frontend files Search in $(HOME) and in the current directory. --- Makefile | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 253e5d986..c4879588b 100644 --- a/Makefile +++ b/Makefile @@ -94,23 +94,33 @@ endif # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now - CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include - ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) + # hard-coded path for now in either . or ($HOME) directory + # this can be overridden by setting CUDNN_FRONTEND_PATH on the command line + ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) - NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) - NVCC_LDFLAGS += -lcudnn - NVCC_FLAGS += -DENABLE_CUDNN - NVCC_CUDNN = cudnn_att.o + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"),) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) endif - else + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + NVCC_CUDNN = cudnn_att.o + else ifneq ($(OS), Windows_NT) $(info → cuDNN is not supported on MAC OS right now) else $(info ✓ Windows cuDNN found, will run with flash-attention) - CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location + else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location + else + $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) + endif CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 @@ -214,7 +224,7 @@ ifeq ($(NVCC),) $(info ✗ nvcc not found, skipping GPU/CUDA builds) else $(info ✓ nvcc found, including GPU/CUDA support) - TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu + TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN) endif $(info ---------------------------------------------) From 69f6c4f765cdebceede2b32501d765b68fcc30cc Mon Sep 17 00:00:00 2001 From: Horace He Date: Mon, 6 May 2024 13:10:17 -0700 Subject: [PATCH 027/172] Don't return logits during training for PyTorch baseline This improves perf somewhat, since currently it's always returning logits (which thus need to be materialized). --- train_gpt2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index f2fa68c9b..1446e6165 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -147,12 +147,11 @@ def forward(self, idx, targets=None): # if we are given some desired targets also calculate the loss logits = self.lm_head(x) loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) + return None, loss else: # inference-time mini-optimization: only forward the lm_head on the very last position logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim - loss = None - - return logits, loss + return logits, None @classmethod def from_pretrained(cls, model_type): From 5adb6ef2c1223203bbfbe0a43f573dfa503c68e7 Mon Sep 17 00:00:00 2001 From: ademeure Date: Mon, 6 May 2024 21:24:42 +0100 Subject: [PATCH 028/172] Add tensor core and overall efficiency stats to profiler script. 
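The idea behind the new profiler stats in this patch: each kernel is assumed to be limited either by DRAM bandwidth or by tensor-core throughput, so its efficiency is taken as the larger of the two utilisation ratios, and the run-level number is the time-weighted average across kernels (with the tensor-core peak clamped to 50% or 100%, since consumer GPUs top out at 50% on this counter). A hedged sketch of that arithmetic in C, with illustrative names rather than the script's own:

// per-kernel and overall efficiency as described above (illustrative, not the script's code)
typedef struct { float time_ms; float dram_gib_rw; float tensor_pct; } KernelStat;

float overall_efficiency_pct(const KernelStat* k, int n,
                             float max_dram_bw, float max_tensor_pct) {
    float weighted = 0.0f, total_time = 0.0f;
    for (int i = 0; i < n; i++) {
        float dram_bw = k[i].dram_gib_rw / (k[i].time_ms / 1000.0f); // bandwidth actually achieved
        float eff_bw  = dram_bw / max_dram_bw;                       // DRAM utilisation ratio
        float eff_tc  = k[i].tensor_pct / max_tensor_pct;            // tensor core utilisation ratio
        float eff     = (eff_bw > eff_tc) ? eff_bw : eff_tc;         // assume the better one is the limiter
        weighted   += eff * k[i].time_ms;                            // time-weighted accumulation
        total_time += k[i].time_ms;
    }
    return 100.0f * weighted / total_time;
}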
--- profile_gpt2cu.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 8e8847369..d9dbd4f8e 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -39,7 +39,8 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "smsp__inst_executed.sum", # instructions + "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # % of peak tensor core utilization + "smsp__inst_executed.sum", # instructions ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -72,16 +73,35 @@ assert CLS_START != -1 +# Check every kernel to find the maximum DRAM bandwidth and Tensor Core utilisation values +max_dram_bw = 0.0 +max_tensor = 0.0 +for rid, row in kernel_profile_data: + if rid <= 2: + continue + time = float(row[13]) + read = float(row[11]) + write = float(row[12]) + tensor = float(row[16]) + dram_bw = (read + write) / (time / 1000.0) + max_dram_bw = max(max_dram_bw, dram_bw) + max_tensor = max(max_tensor, tensor) + +# round the maximum tensor core utilisation to 50% or 100% +# consumer GPUs can only achieve 50% of peak tensor throughput on this counter +# and for GPUs without tensor cores, we set the value to 50% to avoid division by zero +max_tensor = (max_tensor > 50.0) and 100.0 or 50.0 + print() print("Kernel calls:") for rid, row in kernel_profile_data: if rid == 0: # headings - print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print( f"id pass {'name':<40} {'time':>8} {'RAM BW':>8} {'tensor':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<40} {'ms':>8} {'GB/s':>8} {'core %':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" print(units) print("." * len(units)) continue @@ -95,7 +115,9 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) / 1e6 + tensor = float(row[16]) + inst = float(row[17]) / 1e6 + dram_bw = (read + write) / (time / 1000.0) kid = rid - 2 @@ -149,6 +171,7 @@ l2_read = l2_read * 32 / 1024 / 1024 / 1024 l2_write = l2_write * 32 / 1024 / 1024 / 1024 + efficiency = max(dram_bw / max_dram_bw, tensor / max_tensor) summaries[fn_name] += time counts[fn_name] += multiplier passes[pass_name] += time @@ -159,13 +182,18 @@ total['l2_read'] += l2_read total['l2_write'] += l2_write total['inst'] += inst + total['tensor'] += tensor * time # % so multiplied by time + total['efficiency'] += efficiency * time pass_info = f"{pass_name}×{multiplier}" - print(f"{kid:02} {pass_info:7} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + print(f"{kid:02} {pass_info:7} {fn_name:<40} {time:8.2f} {dram_bw:8.1f} {tensor:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + total_time = total['time'] +avg_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0) +avg_tensor_util = total['tensor'] / total_time print("." 
* len(units)) -print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<40} {total['time']:8.2f} {avg_dram_bw:8.1f} {avg_tensor_util:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") @@ -192,5 +220,9 @@ We read {total['read']:.1f}GiB ({total['read']/ts:.1f}GB/s) and write {total['write']:.1f}GiB ({total['write']/ts:.1f}GB/s) to DRAM, read {total['l2_read']:.1f}GiB ({total['l2_read']/ts:.1f}GB/s) and write {total['l2_write']:.1f}GiB ({total['l2_write']/ts:.1f}GB/s) to L2, and execute {total['inst'] / 1000:.1f} billion instructions ({total['inst'] / 1000 / ts:.1f} GInst/s). + +Assuming that every kernel should be either fully DRAM bandwidth or tensor core limited, +with a peak DRAM bandwidth of {max_dram_bw:.1f}GB/s and a peak tensor throughput of {max_tensor:.1f}%, +our overall efficiency is {(total['efficiency'] * 100.0 / total_time):.1f}%. """ print(summary) \ No newline at end of file From f7d77600845537597654e4ace1ef0914ab413277 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 13:56:17 -0700 Subject: [PATCH 029/172] Non-standard C syntax replace with standard C Also, adding unistd.h back because that has some changes required to find M_PI. --- train_gpt2.cu | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 86ceb5caa..7a73cb612 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -34,6 +34,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), -a 1 is "overfit single batch", -x 10 is 10 iterations, and -f 0 disables tf32 */ +#include #include #include #include @@ -53,6 +54,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +#undef FLT_MAX // ---------------------------------------------------------------------------- // CUDA precision settings @@ -171,7 +173,7 @@ void mpi_check(int status, const char *file, int line) { // we need to be careful here to only define our own versions if none already exist, otherwise the compiler will // complain. 
// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) -#if defined(ENABLE_BF16) and __CUDACC_VER_MAJOR__ < 12 and not(__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) __device__ floatX __ldcs(const floatX* address) { unsigned short bf = __ldcs(reinterpret_cast(address)); return __nv_bfloat16_raw{bf}; @@ -489,12 +491,12 @@ __global__ void encoder_forward_kernel3(floatX* out, } template -__device__ void atomicStochasticAdd(T* address, float val0, float val1, uint seed) { +__device__ void atomicStochasticAdd(T* address, float val0, float val1, unsigned int seed) { static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); float2 val = make_float2(val0, val1); - uint* address_as_uint = (uint*)address; - uint old = *address_as_uint, assumed; - uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; + unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); do { assumed = old; float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, @@ -502,17 +504,17 @@ __device__ void atomicStochasticAdd(T* address, float val0, float val1, uint see T new_rounded[2]; stochastic_rounding(new_fp32.x, &new_rounded[0], random); stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); - old = atomicCAS(address_as_uint, assumed, *(uint*)&new_rounded); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_rounded); } while (assumed != old); } -__device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { +__device__ void atomicStochasticAdd(float* address, float val0, float val1, unsigned int seed) { atomicAdd(address, val0); atomicAdd(address + 1, val1); } __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C, uint seed) { + int B, int T, int C, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int N = B * T * C; idx *= 2; // 2 elements per thread @@ -1090,7 +1092,7 @@ void encoder_forward(floatX* out, void encoder_backward(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C, uint seed) { + int B, int T, int C, unsigned int seed) { NVTX_RANGE_FN(); const int N = B * T * C; const int block_size = 256; From 9b55ea8cf5b6ad5796b8b11f8c2d6fc4da2f6161 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:11:42 +0100 Subject: [PATCH 030/172] More crazy optimisations to layernorm_backward, fused_classifier, and matmul_backward_bias. 
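Much of this patch moves the classifier and layernorm-backward kernels onto the 128-bit Packed128 accessors (x128 is Packed128<floatX>, i.e. 8 elements for BF16/FP16 or 4 for FP32) while keeping the arithmetic in FP32. As a reminder of the access pattern the kernels below rely on, here is a toy kernel using those helpers; scale_kernel_example is illustrative only and assumes N is a multiple of x128::size:

// each thread moves one 128-bit vector per load/store; the .cs variants hint "streaming"
// (evict-first) so data that is touched only once does not thrash the caches
__global__ void scale_kernel_example(floatX* out, const floatX* inp, float alpha, int N) {
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size;
    if (idx >= N) { return; }
    x128 packed_in = load128cs(inp + idx);   // load 128 bits, do not keep in cache
    x128 packed_out;
    for (int k = 0; k < x128::size; k++) {
        packed_out[k] = (floatX)((float)packed_in[k] * alpha);  // do the maths in FP32
    }
    store128(out + idx, packed_out);         // one 128-bit store per thread
}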
--- dev/cuda/classifier_fused.cu | 222 ++++++++++++++++++++++++++---- dev/cuda/layernorm_backward.cu | 142 +++++++++++++++++++- train_gpt2.cu | 238 ++++++++++++++++++++++----------- 3 files changed, 499 insertions(+), 103 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 9202c2cee..522ac5135 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -21,6 +21,21 @@ nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_f #include #include "common.h" +// todo - this file does not properly support anything but FP32 +// kernel 5 can be run in fp16/bf16 to test performance, but the outputs will be wrong +#undef ENABLE_BF16 +#undef ENABLE_FP16 +#define ENABLE_BF16 + +#if defined(ENABLE_BF16) +typedef __nv_bfloat16 floatX; +#elif defined(ENABLE_FP16) +typedef half floatX; +#else +typedef float floatX; +#endif +typedef Packed128 x128; + // ---------------------------------------------------------------------------- // CPU code reference @@ -382,18 +397,18 @@ __global__ void fused_classifier_kernel3(float* dlogits, float* losses, float* p } } -__device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, int V, int P) { +__device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const floatX* inp, int V, int P) { // one row of inp, i.e. inp[idx, :] of shape (V,) - const float* x = inp + idx * P; + const floatX* x = inp + idx * P; float thread_maxval = -INFINITY; float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = ceil_div(V, f128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { - f128 packed_x = load128cs(x + i * f128::size); // load and do not keep in cache + for (int i = ceil_div(V, x128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + x128 packed_x = load128cs(x + i * x128::size); // load and do not keep in cache for(int k = 0; k < packed_x.size; ++k) { - if (i*f128::size+k >= V) { // bounds checking against real V + if (i*x128::size+k >= V) { // bounds checking against real V continue; } float v = (float)packed_x[k]; @@ -436,9 +451,9 @@ __device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, i return SoftmaxParams{1.f / block_sumval, block_maxval}; } -// same as 2 but not using float4 -__global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* probs, - const float* logits, const float* dlosses, const int* targets, +// same as 2 but using x128 +__global__ void fused_classifier_kernel4(floatX* dlogits, floatX* losses, floatX* probs, + const floatX* logits, const floatX* dlosses, const int* targets, int B, int T, int V, int P) { int idx = blockIdx.x; int ix = targets[idx]; @@ -448,21 +463,21 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p // calculate the probability needed for the loss and update (single-threaded) if(threadIdx.x == 0) { - float prob = expf(logits[idx * P + ix] - sp.Offset) * sp.Scale; + float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; losses[idx] = -logf(prob); } // very sensible default for dlosses is 1/(B*T), which is the uniform loss - float dloss = dlosses != NULL ? dlosses[idx] : 1.0f / (B*T); + float dloss = dlosses != NULL ? 
(float)dlosses[idx] : 1.0f / (B*T); // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging - const float* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < ceil_div(V , f128::size); i += blockDim.x) { + const floatX* logits_vec = logits + idx * P; + for (int i = threadIdx.x; i < ceil_div(V , x128::size); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache persistence - f128 packed_logits_vec = load128cs(logits_vec + i * f128::size); // load and do not keep in cache - f128 packed_probs; - f128 packed_dlogits; + x128 packed_logits_vec = load128cs(logits_vec + i * x128::size); // load and do not keep in cache + x128 packed_probs; + x128 packed_dlogits; for(int k = 0; k < packed_logits_vec.size; ++k) { int element = i*packed_logits_vec.size + k; if (element >= V) { // bounds checking against real V @@ -474,6 +489,7 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p float indicator = (element == ix) ? 1.0f : 0.0f; packed_dlogits[k] = (prob - indicator) * dloss; } + // Note: missing .cs hint hurts our performance due to cache thrashing, fixed in kernel5 store128(dlogits + idx * P + i * packed_logits_vec.size, packed_dlogits); if (probs != NULL) { store128(probs + idx * P + i * packed_logits_vec.size, packed_probs); @@ -481,6 +497,143 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p } } +// todo - move to common.h - or ideally somewhere it's not duplicated between train & common? +// requires all 32 threads in the warp to be active, but should work for any block size +// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes +// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end +// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 +using reduction_func_t = float (*) (float); +template +__device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { + // two reductions of up to 1024 threads: + // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) + __shared__ float shared_val[32]; + const int lane_id = threadIdx.x % 32; + const int warp_id = threadIdx.x / 32; + const int num_warps = blockDim.x / 32; + + float warp_val = warp_reduction(val); + if (lane_id == 0) { shared_val[warp_id] = warp_val; } + __syncthreads(); + warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; + float block_val = warp_reduction(warp_val); + + if (final_sync) { + __syncthreads(); // only needed in loops when effectively reusing shared memory etc. + } + return block_val; +} + +__device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, int V, int P) { + // same but not float4 + // one row of inp, i.e. 
inp[idx, :] of shape (V,) + + const floatX* x = inp + idx * P; + float thread_maxval = -INFINITY; + float thread_sumval = 0.0f; + int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; + + // special-case loop to handle the unaligned elements at the end of the array + // this lets us skip the bounds check in the main loop below, which improves performance + while ((i+1)*x128::size > V) { + for(int k = 0; k < x128::size; ++k) { + if (i*x128::size+k >= V) { + break; // bounds checking against real V (rather than padded P) + } + float v = (float)x[i*x128::size+k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + i -= blockDim.x; + } + + // main loop for the bulk of the iterations (no bounds checking required!) + for (; i >= 0; i -= blockDim.x) { + x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop + for(int k = 0; k < x128::size; ++k) { + float v = (float)packed_x[k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + } + + // Block Max Reduction -> Maths -> Block Sum Reduction + float block_maxval = blockReduce(thread_maxval, false, -FLT_MAX); + thread_sumval *= expf(thread_maxval - block_maxval); + float block_sumval = blockReduce(thread_sumval); + + // return the softmax parameters + return SoftmaxParams{1.f / block_sumval, block_maxval}; +} + +// will _update_ logits to logit gradients +// uses template to decide whether to write logits and probs +// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts +template +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + fused_classifier_kernel5(floatX* dlogits, floatX* losses, floatX* probs, + const floatX* logits, const floatX* dlosses, const int* targets, + int B, int T, int V, int P) { + int idx = blockIdx.x; + int ix = targets[idx]; + + // softmax (reading B * T * V, same logits read again below, hopefully still in cache) + SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); + + // calculate the probability needed for the loss and update (single-threaded) + if(threadIdx.x == 0) { + float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; + losses[idx] = (floatX)(-logf(prob)); + } + + // very sensible default for dlosses is 1/(B*T), which is the uniform loss + float dloss = (dlosses != NULL) ? (float)dlosses[idx] : 1.0f / (B*T); + // calculate the gradients directly, saves bandwidth from probs during training + // but also supports writing probs for inference-only and debugging + const floatX* logits_vec = logits + idx * P; + int i = threadIdx.x; + for (; i < V/x128::size; i += blockDim.x) { + // this is the 2nd read of logits after the one in prepare_softmax2 + // it will be overwritten by the logits gradients which is when we reduce cache persistence + x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs + x128 packed_probs; + for(int k = 0; k < x128::size; ++k) { + int element = i*x128::size + k; + float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; + packed_probs[k] = (floatX)prob; + float indicator = (element == ix) ? 
1.0f : 0.0f; + packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); + } + if (WriteLogits){ + // reduce cache persistence for the overwritten logits + // to maximise probability that logits remain in cache between prepare_softmax and here + store128cs(dlogits + idx * P + i * x128::size, packed_logits_vec); + } + if (WriteProbs) { + store128(probs + idx * P + i * x128::size, packed_probs); + } + } + + // handle remaining elements after the last multiple of x128::size + // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements + i *= x128::size; + for (; i < V; i++) { + float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; + float indicator = (i == ix) ? 1.0f : 0.0f; + float dlogit = (prob - indicator) * dloss; + if (WriteLogits){ + __stcs(dlogits + idx * P + i, (floatX)dlogit); + } + if (WriteProbs) { + probs[idx * P + i] = (floatX)prob; + } + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -519,7 +672,16 @@ void fused_classifier4(float* dlogits, float* losses, int B, int T, int V, int P, int block_size) { const int N = B * T; const int grid_size = N; - fused_classifier_kernel4<<>>(dlogits, losses, NULL, logits, dlosses, targets, B, T, V, P); + fused_classifier_kernel4<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); + cudaCheck(cudaGetLastError()); +} + +void fused_classifier5(float* dlogits, float* losses, + const float* logits, const float* dlosses, const int* targets, + int B, int T, int V, int P, int block_size) { + const int N = B * T; + const int grid_size = N; + fused_classifier_kernel5<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -539,6 +701,9 @@ void fused_classifier(int kernel_num, float* dlogits, float* losses, case 4: fused_classifier4(dlogits, losses, logits, dlosses, targets, B, T, V, P, block_size); break; + case 5: + fused_classifier5(dlogits, losses, logits, dlosses, targets, B, T, V, P, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -606,17 +771,22 @@ int main(int argc, char **argv) { crossentropy_forward_cpu(losses, probs, targets, B, T, V); crossentropy_softmax_backward_cpu(dlogits, dlosses, probs, targets, B, T, V); - // time the kernel at different block sizes - for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { - int block_size = block_sizes[j]; - printf("Checking block size %d.\n", block_size); - fused_classifier(kernel_num, d_dlogits, d_losses, d_logits, d_dlosses, d_targets, B, T, V, P, block_size); - validate_result(d_losses, losses, "losses", B * T, 1e-4f); - // undo the padding before we can check for correctness - cudaCheck(cudaMemcpy2D(d_dlogits_no_pad, V * sizeof(float), d_dlogits, P * sizeof(float), V * sizeof(float), B * T, cudaMemcpyDeviceToDevice)); - validate_result(d_dlogits_no_pad, dlogits, "dlogits", B * T * V, 1e-4f); +#if defined(ENABLE_BF16) || defined(ENABLE_FP16) + if (kernel_num < 4) // kernel 4/5 + BF16 is only for testing performance, it doesn't do the format conversions yet etc... 
+#endif + { + // time the kernel at different block sizes + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + fused_classifier(kernel_num, d_dlogits, d_losses, d_logits, d_dlosses, d_targets, B, T, V, P, block_size); + validate_result(d_losses, losses, "losses", B * T, 1e-4f); + // undo the padding before we can check for correctness + cudaCheck(cudaMemcpy2D(d_dlogits_no_pad, V * sizeof(float), d_dlogits, P * sizeof(float), V * sizeof(float), B * T, cudaMemcpyDeviceToDevice)); + validate_result(d_dlogits_no_pad, dlogits, "dlogits", B * T * V, 1e-4f); + } + printf("All results match. Starting benchmarks.\n\n"); } - printf("All results match. Starting benchmarks.\n\n"); for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index c1f01b0e6..1f432ba82 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -32,6 +32,7 @@ typedef half floatN; typedef float floatX; typedef float floatN; #endif +typedef Packed128 x128; // ---------------------------------------------------------------------------- // CPU code reference @@ -125,7 +126,7 @@ void layernorm_backward_cpu(float* dinp, float* dweight, float* dbias, // GPU kernels // GPU helper functions for atomicAdd on smaller than 32-bit types -__device__ floatX warpReduceSum(floatX val) { +__device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { val += __shfl_xor_sync(0xFFFFFFFF, val, offset); } @@ -751,6 +752,128 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX } } +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { + extern __shared__ float shared[]; // size = 2 * C + 1 + int warpId = threadIdx.x / warpSize; // warp index within a block + int warpsInBlock = blockDim.x / warpSize; //number of warps in block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = warpSize * x128::size; + int iterations_C = C / C_per_iteration; + + // the first half of shared memory is bias, second is weight + float* dbias_shared = shared; + float* dweight_shared = shared + C; + + // init shared memory to zero + for(int i = threadIdx.x; i < C; i+= blockDim.x){ + dbias_shared[i] = 0.0f; + dweight_shared[i] = 0.0f; + } + unsigned int *tmp_flag = (unsigned int*)(shared + C*2); + __syncthreads(); + + for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { + int b = idx / T; + int t = idx % T; + + const floatX* dout_bt = dout + b * T * C + t * C; + const floatX* inp_bt = inp + b * T * C + t * C; + floatX* dinp_bt = dinp + b * T * C + t * C; + const float mean_bt = (float)mean[b * T + t]; + const float rstd_bt = (float)rstd[b * T + t]; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = 
((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } + } + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; + + // now iterate again and accumulate all the gradients + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + // gradient contribution to bias (using shared memory friendly index) + atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + // gradient contribution to weight (using shared memory friendly index) + atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + // gradient contribution to input + float dval = 0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + // Accumulate into a FP32 scratchpad + // BF16 atomics are potentially much slower... and this is more precise! 
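    // How the cross-block reduction below works: every threadblock atomicAdd's its
    // partial dbias/dweight sums into the FP32 scratch buffer, then thread 0 bumps a
    // counter with atomicInc. The block whose atomicInc returns gridDim.x-1 is the last
    // one to arrive, so it alone converts the accumulated FP32 totals back to floatX and
    // writes the final dbias/dweight, reordering back to the global memory layout.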
+ // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though + __syncthreads(); + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); + for(int i = threadIdx.x; i < C; i+= blockDim.x) { + // global atomics in the same "shared memory banking friendly" order + atomicAdd(&scratch_dbias[i], dbias_shared[i]); + atomicAdd(&scratch_dweight[i], dweight_shared[i]); + } + __syncthreads(); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + for (int i = warpId; i < iterations_C; i += warpsInBlock) { + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + + x128 dbias128; + x128 dweight128; + for (int x = 0; x < x128::size; x++) { + dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; + dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + // ---------------------------------------------------------------------------- // kernel launchers @@ -828,6 +951,20 @@ void layernorm_backward7(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* s layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); } +template +void layernorm_backward8(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* scratch, + const Tdout* dout, const Trest* inp, const Tparams* weight, const Trest* mean, const Trest* rstd, + int B, int T, int C, int block_size) { + const int grid_size = (1024/block_size) * cuda_num_SMs; + size_t shared_mem_size = (2 * C + 1) * sizeof(float); + + // Including this as part of the timing until we can parallelise it + // It should fully hide the cost and improve kernel perf by >5% if done in parallel using CUDA streams + cudaMemset(scratch, 0, (1 + 2 * C) * sizeof(float)); + + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); +} + // kernel version dispatch void layernorm_backward(int kernel_num, floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -860,6 +997,9 @@ void layernorm_backward(int kernel_num, case 7: layernorm_backward7(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); break; + case 8: + layernorm_backward8(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); diff --git a/train_gpt2.cu b/train_gpt2.cu index 86ceb5caa..c64cf7199 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -108,6 +108,14 @@ class NvtxRange { }; #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) +// try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance +// this needs to be defines rather than queried to be used for __launch_bounds__ +#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 +#define MAX_1024_THREADS_BLOCKS 2 +#else +#define MAX_1024_THREADS_BLOCKS 1 +#endif + // cuBLAS workspace. 
Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK const size_t cublaslt_workspace_size = 32 * 1024 * 1024; void* cublaslt_workspace = NULL; @@ -271,6 +279,11 @@ template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 +template +__device__ void store128cg(ElementType* target, Packed128 value) { + __stcg(reinterpret_cast(target), value.get_bits()); +} // short-form typedefs typedef Packed128 f128; @@ -772,7 +785,7 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -__global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC) { // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX @@ -793,36 +806,48 @@ __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { - shared[idx] = 0.0f; + for (int i = thread_id; i < OC_per_warp; i += block_size) { + shared[i] = 0.0f; } __syncthreads(); - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + i*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // so we accumulate in a conflict-free order, then reorder to match the global memory order for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + local_oc + k, accumulators[k]); + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); } + if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data __syncthreads(); - if (threadIdx.y == 0) { - for (int idx = threadIdx.x; idx < OC_per_warp; idx += block_size_x) { - atomicAdd(dbias + idx + blockIdx.x*OC_per_warp, shared[idx]); - } - } + // read the accumulated values in the conflict-free order + int i = threadIdx.x + (threadIdx.y * block_size_x); + float tmp = shared[i]; + __syncthreads(); + // write them back to shared memory in the global memory order + // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) + shared[local_oc + threadIdx.y] = tmp; + __syncthreads(); + // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); } -__global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, - const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, - int B, int T, int C) { +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const 
floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { extern __shared__ float shared[]; // size = 2 * C + 1 int warpId = threadIdx.x / warpSize; // warp index within a block int warpsInBlock = blockDim.x / warpSize; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = warpSize * x128::size; + int iterations_C = C / C_per_iteration; // the first half of shared memory is bias, second is weight float* dbias_shared = shared; @@ -849,56 +874,85 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX // first: two reduce operations float dnorm_mean = 0.0f; float dnorm_norm_mean = 0.0f; - for (int i = warpThreadIdx; i < C; i += warpSize) { - float norm_bti = ((float)inp_bt[i] - mean_bt) * rstd_bt; - float dnorm_i = (float)weight[i] * (float)dout_bt[i]; - dnorm_mean += dnorm_i; - dnorm_norm_mean += dnorm_i * norm_bti; + for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = ((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } } dnorm_mean = warpReduceSum(dnorm_mean) / C; dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; // now iterate again and accumulate all the gradients - // todo - use x128 for this loop to improve performance - for (int i = warpThreadIdx; i < C; i += warpSize) { - float dout_i = (float)__ldcs(&dout_bt[i]); - float norm_bti = ((float)__ldcs(&inp_bt[i]) - mean_bt) * rstd_bt; - float dnorm_i = (float)weight[i] * dout_i; - // gradient contribution to bias - atomicAdd(&dbias_shared[i], dout_i); - // gradient contribution to weight - atomicAdd(&dweight_shared[i], norm_bti * dout_i); - // gradient contribution to input - float dval = 0.0f; - dval += dnorm_i; // term 1 - dval -= dnorm_mean; // term 2 - dval -= norm_bti * dnorm_norm_mean; // term 3 - dval *= rstd_bt; // final scale - dinp_bt[i] = (floatX)((float)dinp_bt[i] + dval); + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + // gradient contribution to bias (using shared memory friendly index) + atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + // gradient contribution to weight (using shared memory friendly index) + atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + // gradient contribution to input + float dval = 
0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); } } - // Accumulate into a FP32 scratchpad // BF16 atomics are potentially much slower... and this is more precise! - // todo - could avoid the extra copy if floatX is FP32, fairly negligible though + // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); for(int i = threadIdx.x; i < C; i+= blockDim.x) { + // global atomics in the same "shared memory banking friendly" order atomicAdd(&scratch_dbias[i], dbias_shared[i]); atomicAdd(&scratch_dweight[i], dweight_shared[i]); } __syncthreads(); if (threadIdx.x == 0) { - *tmp_flag = atomicAdd(scratchFlag, 1); + *tmp_flag = atomicInc(scratchFlag, gridDim.x); } __syncthreads(); if (*tmp_flag == gridDim.x-1) { - for(int i = threadIdx.x; i < C; i+= blockDim.x) { - // todo - potentially do stochastic rounding here as well - dbias[i] = (floatX)scratch_dbias[i]; - dweight[i] = (floatX)scratch_dweight[i]; + for (int i = warpId; i < iterations_C; i += warpsInBlock) { + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + + x128 dbias128; + x128 dweight128; + for (int x = 0; x < x128::size; x++) { + dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; + dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); } } } @@ -982,21 +1036,35 @@ struct SoftmaxParams { float Offset; }; -__device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, int V, int P) { +__device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, int V, int P) { // same but not float4 // one row of inp, i.e. 
inp[idx, :] of shape (V,) const floatX* x = inp + idx * P; float thread_maxval = -INFINITY; float thread_sumval = 0.0f; - // do the loop in reverse to maximise probability of L2 cache hits - // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { - x128 packed_x = load128(x + i * x128::size); // try to keep in cache until next read - for(int k = 0; k < packed_x.size; ++k) { - if (i*x128::size+k >= V) { // bounds checking against real V - continue; + int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; + + // special-case loop to handle the unaligned elements at the end of the array + // this lets us skip the bounds check in the main loop below, which improves performance + while ((i+1)*x128::size > V) { + for(int k = 0; k < x128::size; ++k) { + if (i*x128::size+k >= V) { + break; // bounds checking against real V (rather than padded P) } + float v = (float)x[i*x128::size+k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + i -= blockDim.x; + } + + // main loop for the bulk of the iterations (no bounds checking required!) + for (; i >= 0; i -= blockDim.x) { + x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop + for(int k = 0; k < x128::size; ++k) { float v = (float)packed_x[k]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); @@ -1006,7 +1074,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, i } // Block Max Reduction -> Maths -> Block Sum Reduction - float block_maxval = blockReduce(thread_maxval); + float block_maxval = blockReduce(thread_maxval, false, -INFINITY); thread_sumval *= expf(thread_maxval - block_maxval); float block_sumval = blockReduce(thread_sumval); @@ -1014,16 +1082,19 @@ __device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, i return SoftmaxParams{1.f / block_sumval, block_maxval}; } -// same as 2 but not using float4 (see dev/cuda/classifier_fused.cu) // will _update_ logits to logit gradients -__global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* probs, +// uses template to decide whether to write logits and probs +// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts +template +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs, const floatX* dlosses, const int* targets, int B, int T, int V, int P) { int idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data int ix = targets[idx]; // softmax (reading B * T * V, same logits read again below, hopefully still in cache) - SoftmaxParams sp = prepare_softmax_blockwide(idx, logits, V, P); + SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); // calculate the probability needed for the loss and update (single-threaded) if(threadIdx.x == 0) { @@ -1036,28 +1107,41 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < (V+x128::size-1)/x128::size; i += blockDim.x) { + int i = threadIdx.x; + for (; i < V/x128::size; i 
+= blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 - // this data will never be needed again, so we reduce cache persistence - x128 packed_logits_vec = load128cs(logits_vec + i * x128::size); // load and do not keep in cache + // it will be overwritten by the logits gradients which is when we reduce cache persistence + x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs x128 packed_probs; - x128 packed_logits; - for(int k = 0; k < packed_logits_vec.size; ++k) { - int element = i*packed_logits_vec.size + k; - if (element >= V) { // bounds checking against real V - continue; - } - float v = (float)packed_logits_vec[k]; - float prob = expf(v - sp.Offset) * sp.Scale; + for(int k = 0; k < x128::size; ++k) { + int element = i*x128::size + k; + float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; packed_probs[k] = (floatX)prob; float indicator = (element == ix) ? 1.0f : 0.0f; - packed_logits[k] = (floatX)((prob - indicator) * dloss); + packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); } - if (logits != NULL){ - store128(logits + idx * P + i * packed_logits_vec.size, packed_logits); + if (WriteLogits){ + // reduce cache persistence for the overwritten logits + // to maximise probability that logits remain in cache between prepare_softmax and here + store128cs(logits + idx * P + i * x128::size, packed_logits_vec); } - if (probs != NULL) { - store128(probs + idx * P + i * packed_logits_vec.size, packed_probs); + if (WriteProbs) { + store128(probs + idx * P + i * x128::size, packed_probs); + } + } + + // handle remaining elements after the last multiple of x128::size + // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements + i *= x128::size; + for (; i < V; i++) { + float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; + float indicator = (i == ix) ? 1.0f : 0.0f; + float dlogit = (prob - indicator) * dloss; + if (WriteLogits){ + __stcs(logits + idx * P + i, (floatX)dlogit); + } + if (WriteProbs) { + probs[idx * P + i] = (floatX)prob; } } } @@ -1286,9 +1370,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, / (block_size * grid_size_x)); // full GPU! 
assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance + assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel6<<>>(dbias_buffer, dout, B, T, OC); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); @@ -1310,14 +1395,15 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 1024; - const int grid_size = deviceProp.multiProcessorCount; + const int grid_size = MAX_1024_THREADS_BLOCKS * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); - layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } + // the sequence of transformations in this compound op is: // inp (B,T,3C) -> qkvr (B,T,3C) -> preatt (B,NH,T,T) -> att (B,NH,T,T) -> vaccum (B,T,C) -> out (B,T,C) void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* datt, floatX* scratch, @@ -1370,14 +1456,14 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // replaces logits with logit gradients template -void fused_classifier3(Type* logits, Type* losses, +void fused_classifier(Type* logits, Type* losses, const Type* dlosses, const int* targets, int B, int T, int V, int P) { NVTX_RANGE_FN(); const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel3<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1840,7 +1926,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass // we're passing dlosses = NULL, which will default them to 1.0f/(B*T), i.e. 
uniform loss - fused_classifier3(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); + fused_classifier(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event From 1ea7f9bf2595cec705f8d28d685e669a46d58f8e Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:54:50 +0100 Subject: [PATCH 031/172] tiny irrelevant optimisation to final unaligned fused_classifier loop + add missing common.h changes --- dev/cuda/classifier_fused.cu | 7 +++---- dev/cuda/common.h | 16 +++++++++++++--- train_gpt2.cu | 15 +++++++-------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 522ac5135..55f0d44cb 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -595,8 +595,7 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - int i = threadIdx.x; - for (; i < V/x128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // it will be overwritten by the logits gradients which is when we reduce cache persistence x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs @@ -620,8 +619,8 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // handle remaining elements after the last multiple of x128::size // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements - i *= x128::size; - for (; i < V; i++) { + int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size + for (int i = threadIdx.x + unaligned_start; i < V; i++) { float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; float indicator = (i == ix) ? 
1.0f : 0.0f; float dlogit = (prob - indicator) * dloss; diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 77e012fcd..63d0e1de1 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -48,6 +48,14 @@ int cuda_arch_minor = 0; int cuda_num_SMs = 0; // for persistent threads where we want 1 threadblock per SM int cuda_threads_per_SM = 0; // needed to calculate how many blocks to launch to fill up the GPU +// ---------------------------------------------------------------------------- +// to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance +#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 +#define MAX_1024_THREADS_BLOCKS 2 +#else +#define MAX_1024_THREADS_BLOCKS 1 +#endif + // ---------------------------------------------------------------------------- // Packed128 data structure, which forces the compiler to use 128-bit loads/stores // in GPUs that support (the LDG.128 and STS.128 instructions) @@ -88,24 +96,26 @@ template __device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } - // load a Packed128 from an aligned memory address with streaming cache hint template __device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } - // store a Packed128 to an aligned memory address template __device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } - // store a Packed128 to an aligned memory address with streaming cache hint template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 +template +__device__ void store128cg(ElementType* target, Packed128 value) { + __stcg(reinterpret_cast(target), value.get_bits()); +} // ---------------------------------------------------------------------------- // random utils diff --git a/train_gpt2.cu b/train_gpt2.cu index c64cf7199..3a339d56d 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -806,12 +806,12 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int i = thread_id; i < OC_per_warp; i += block_size) { - shared[i] = 0.0f; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; } __syncthreads(); - for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + i*OC); + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } @@ -1107,8 +1107,7 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - int i = threadIdx.x; - for (; i < V/x128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // it will be overwritten by the logits gradients which is when we reduce cache persistence x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs @@ -1132,8 +1131,8 @@ 
__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // handle remaining elements after the last multiple of x128::size // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements - i *= x128::size; - for (; i < V; i++) { + int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size + for (int i = threadIdx.x + unaligned_start; i < V; i++) { float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; float indicator = (i == ix) ? 1.0f : 0.0f; float dlogit = (prob - indicator) * dloss; From e1f89b304378b1a5aa6ebfa4d829cefe4c3cfd72 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:57:01 +0100 Subject: [PATCH 032/172] remove BF16 default from classified_fused before PR --- dev/cuda/classifier_fused.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 55f0d44cb..c44727f73 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -23,10 +23,6 @@ nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_f // todo - this file does not properly support anything but FP32 // kernel 5 can be run in fp16/bf16 to test performance, but the outputs will be wrong -#undef ENABLE_BF16 -#undef ENABLE_FP16 -#define ENABLE_BF16 - #if defined(ENABLE_BF16) typedef __nv_bfloat16 floatX; #elif defined(ENABLE_FP16) From 3cc16f135412b0269cc6b369078fd50bdd4ebc89 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 7 May 2024 00:39:25 +0000 Subject: [PATCH 033/172] fix logits bug --- train_gpt2.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index 1446e6165..7fceaff21 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -128,7 +128,7 @@ def __init__(self, config): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying - def forward(self, idx, targets=None): + def forward(self, idx, targets=None, return_logits=True): device = idx.device b, t = idx.size() assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" @@ -147,11 +147,16 @@ def forward(self, idx, targets=None): # if we are given some desired targets also calculate the loss logits = self.lm_head(x) loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) - return None, loss else: # inference-time mini-optimization: only forward the lm_head on the very last position logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim - return logits, None + loss = None + + # there are performance reasons why not returning logits is prudent, if not needed + if not return_logits: + logits = None + + return logits, loss @classmethod def from_pretrained(cls, model_type): @@ -538,8 +543,7 @@ def get_batch(): for i in range(args.num_iterations): t0 = time.time() with ctx: - logits, loss = model(x, y) - del logits + _, loss = model(x, y, return_logits=False) if not args.inference_only: optimizer.zero_grad(set_to_none=True) loss.backward() From 25507542152175543b356e1bdc1e39dffb7c2c68 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 17:41:19 -0700 Subject: [PATCH 034/172] Change FLT_MAX to flt_max --- train_gpt2.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 7a73cb612..d5014d8db 100644 --- a/train_gpt2.cu +++ 
b/train_gpt2.cu @@ -54,7 +54,6 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" -#undef FLT_MAX // ---------------------------------------------------------------------------- // CUDA precision settings @@ -683,8 +682,8 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons const floatX* x = inp + idx * T; // not INF, so we don't get NaNs accidentally when subtracting two values. - const float FLT_MAX = 340282346638528859811704183484516925440.0f; // to avoid including float.h - float maxval = -FLT_MAX; + const float flt_max = 340282346638528859811704183484516925440.0f; // to avoid including float.h + float maxval = -flt_max; float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); From c26124085296b9763b781a0a625f089d22398b0c Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 03:43:27 +0100 Subject: [PATCH 035/172] 3 x 512 threads max for layernorm_backward to avoid cache thrashing (hacky -> better way?) --- train_gpt2.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 3a339d56d..8893e7660 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -835,7 +835,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); } -__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) +__global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, @@ -1393,8 +1393,11 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { NVTX_RANGE_FN(); - const int block_size = 1024; - const int grid_size = MAX_1024_THREADS_BLOCKS * deviceProp.multiProcessorCount; + // todo - forcing 3 x 512 threads per SM maximum is a bit hacky, but more than that results in + // cache thrashing and lower performance on A100... is there a better way? + const int block_size = 512; + const int blocks_per_sm = min(3, (deviceProp.maxThreadsPerMultiProcessor / 1024)); + const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); From b3e8a9fe6758302c76887de16870ca74c55b6401 Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 09:22:14 +0800 Subject: [PATCH 036/172] Implementation of online softmax forward kernel without cgs. 
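The idea, in short: instead of one pass over the row to find the max and a second pass to accumulate the exponentials, a single pass keeps a running maximum and a running sum, rescaling the sum whenever the maximum grows. A minimal CPU sketch of that recurrence (an illustration only, not code from this patch; the function name is made up):

    #include <math.h>

    // single-pass ("online") softmax over one row x[0..C-1]
    void softmax_online_reference(float* out, const float* x, int C) {
        float maxval = -INFINITY;   // running max of x[0..i]
        float sumval = 0.0f;        // running sum of expf(x[j] - maxval)
        for (int i = 0; i < C; i++) {
            if (x[i] > maxval) {
                sumval *= expf(maxval - x[i]);  // rescale old sum to the new max
                maxval = x[i];
            }
            sumval += expf(x[i] - maxval);      // add the new element
        }
        for (int i = 0; i < C; i++) {
            out[i] = expf(x[i] - maxval) / sumval;
        }
    }

The kernel below does the same merge across threads: each thread keeps a partial (maxval, sumval) pair, and the pairs are combined with warp shuffles and shared memory before the final normalization loop writes the row.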
--- dev/cuda/softmax_forward.cu | 92 +++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index f611864f0..891c0cb85 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -514,6 +514,88 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } } +__global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { + // do the same job as softmax_forward_online_kernel1() + // further combines unrolling, shared memory and warp reduce utilities in CUDA + extern __shared__ float shared[]; + const int UNROLL_FACTOR = 8; + // FAKE_MAXVAL is set empirically to mimic the maxval that would appear in real situation + const float FAKE_MAXVAL = 10.f; + const int warpsPerBlock = blockDim.x / warpSize; + int idx = blockIdx.x; + int tid = threadIdx.x; + int laneId = tid % warpSize; + int warpId = tid / warpSize; + float* maxvals = shared; + float* sumvals = &shared[warpsPerBlock]; + + if (tid >= C) { + maxvals[warpId] = -INFINITY; + sumvals[warpId] = 0.0f; + return; + } + + const float* x = inp + idx * C; + float* y = out + idx * C; + + // each thread computes partial maxval and sumval in range [::blockDim.x] + // after finished this part, each thread in block-0 holds partial maxval and sumval + float maxval = -INFINITY, sumval = 0.0f; + for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { + #pragma unroll + for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { + maxval = fmaxf(maxval, x[i + j * blockDim.x]); + // using FAKE_MAXVAL to avoid keeping updating inter-maxvals + sumval += expf(x[i + j * blockDim.x] - FAKE_MAXVAL); + } + } + sumval *= expf(FAKE_MAXVAL - maxval); + + // computes sumval and maxval of each warp (32 threads) + // after finished this part, shared memory holds maxval and sumval of each warp + + float offset_maxval, offset_sumval, tmp_maxval; + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + offset_maxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); + offset_sumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); + tmp_maxval = fmaxf(maxval, offset_maxval); + sumval = sumval * expf(maxval - tmp_maxval) + + offset_sumval * expf(offset_maxval - tmp_maxval); + maxval = tmp_maxval; + } + if (laneId == 0) { + sumvals[warpId] = sumval; + maxvals[warpId] = maxval; + } + __syncthreads(); + + // computes the global maxval and sumval of row `idx` + if (tid < warpsPerBlock / 2) { + #pragma unroll + for (int offset = warpsPerBlock / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + tmp_maxval = fmaxf(maxvals[tid], maxvals[tid + offset]); + sumvals[tid] = sumvals[tid] * expf(maxvals[tid] - tmp_maxval) + + sumvals[tid + offset] * + expf(maxvals[tid + offset] - tmp_maxval); + maxvals[tid] = tmp_maxval; + } + } + } + __syncthreads(); + + // write the final results into `out` + maxval = maxvals[0]; + float sum = sumvals[0]; + for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { + #pragma unroll + for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { + // __stcs(&y[i + j * blockDim.x], expf(x[i + j * blockDim.x] - maxval) / sum); + y[i + j * blockDim.x] = expf(x[i + j * blockDim.x] - maxval) / sum; + } + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -560,6 +642,13 @@ void softmax_forward7(float* out, const float* inp, int N, int C, int block_size softmax_forward_kernel7<<>>(out, inp, N, C); } +void 
softmax_forward_online8(float* out, const float* inp, int N, int C, int block_size) { + const int grid_size = N; + size_t shared_mem_size = 2 * block_size / 32 * sizeof(float); + softmax_forward_online_kernel8<<>>(out, inp, N, C); + cudaCheck(cudaGetLastError()); +} + // kernel version dispatch void softmax_forward(int kernel_num, float* out, const float* inp, int N, int C, const int block_size) { switch (kernel_num) { @@ -584,6 +673,9 @@ void softmax_forward(int kernel_num, float* out, const float* inp, int N, int C, case 7: softmax_forward7(out, inp, N, C, block_size); break; + case 8: + softmax_forward_online8(out, inp, N, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From dc901d420bdabb849cb573f60d355abd6eb348f7 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Wed, 8 May 2024 04:50:24 +0000 Subject: [PATCH 037/172] set correct gpu using multigpu config --- train_gpt2.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 16ff756ce..15d915835 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -428,6 +428,7 @@ MultiGpuConfig multi_gpu_config_init(int *argc, char ***argv) { return result; #else printf("Multi-GPU support is disabled. Using a single GPU.\n"); + cudaCheck(cudaSetDevice(0)); MultiGpuConfig result; result.process_rank = 0; result.num_processes = 1; @@ -2181,12 +2182,10 @@ void gpt2_free(GPT2 *model) { // ---------------------------------------------------------------------------- // common init & free code for train/test/profile void common_start(bool override_enable_tf32 = true, bool print_device_info = true) { - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaGetDeviceProperties(&deviceProp, deviceIdx); + cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx); if (print_device_info) { printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); + printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name); } cudaCheck(cudaStreamCreate(&main_stream)); From 2356be7333323dcd170ff4d8d4ad94d5c50b87d8 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Wed, 8 May 2024 05:02:23 +0000 Subject: [PATCH 038/172] set stream to main_stream in ncclAllReduce --- train_gpt2.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 15d915835..95938d7e0 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2128,8 +2128,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { model->num_parameters, ncclFloatX, ncclAvg, multi_gpu_config->nccl_comm, - // use 0 for default stream (always implicitly synchronised) - /*stream=*/0)); + main_stream)); #endif } From 6a52d8619e6b3ee200d1a07d0c837c919f263486 Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 16:12:39 +0800 Subject: [PATCH 039/172] Optimize codes and comments --- dev/cuda/softmax_forward.cu | 107 ++++++++++++++---------------------- 1 file changed, 40 insertions(+), 67 deletions(-) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index 891c0cb85..fb084dad8 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -515,84 +515,58 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { - // do the same job as softmax_forward_online_kernel1() - // further combines unrolling, shared memory and warp reduce utilities in CUDA - extern __shared__ float 
shared[]; - const int UNROLL_FACTOR = 8; - // FAKE_MAXVAL is set empirically to mimic the maxval that would appear in real situation - const float FAKE_MAXVAL = 10.f; const int warpsPerBlock = blockDim.x / warpSize; - int idx = blockIdx.x; int tid = threadIdx.x; - int laneId = tid % warpSize; - int warpId = tid / warpSize; - float* maxvals = shared; - float* sumvals = &shared[warpsPerBlock]; if (tid >= C) { - maxvals[warpId] = -INFINITY; - sumvals[warpId] = 0.0f; return; } - - const float* x = inp + idx * C; - float* y = out + idx * C; - // each thread computes partial maxval and sumval in range [::blockDim.x] - // after finished this part, each thread in block-0 holds partial maxval and sumval - float maxval = -INFINITY, sumval = 0.0f; - for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { - #pragma unroll - for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { - maxval = fmaxf(maxval, x[i + j * blockDim.x]); - // using FAKE_MAXVAL to avoid keeping updating inter-maxvals - sumval += expf(x[i + j * blockDim.x] - FAKE_MAXVAL); - } - } - sumval *= expf(FAKE_MAXVAL - maxval); + int warpId = tid / warpSize; + int laneId = tid % warpSize; + int row = blockIdx.x * warpsPerBlock + warpId; - // computes sumval and maxval of each warp (32 threads) - // after finished this part, shared memory holds maxval and sumval of each warp - - float offset_maxval, offset_sumval, tmp_maxval; - for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - offset_maxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); - offset_sumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); - tmp_maxval = fmaxf(maxval, offset_maxval); - sumval = sumval * expf(maxval - tmp_maxval) + - offset_sumval * expf(offset_maxval - tmp_maxval); - maxval = tmp_maxval; + if (row >= N) { + return; } - if (laneId == 0) { - sumvals[warpId] = sumval; - maxvals[warpId] = maxval; + + const float* x = inp + row * C; + float* const y = out + row * C; + + // merging calculating maxval and sumval in one loop + // which is an arithmetic improvment from online softmax over normal softmax + float maxval = -INFINITY, sumval = 0.0f, bigger; + for (int i = laneId; i < C; i += warpSize) { + // when updating the maxval, dynamically updates the previous sumval by + // multiplying e^{previous_maxval - current_maxval} + bigger = fmaxf(maxval, x[i]); + sumval = sumval * expf(maxval - bigger) + expf(x[i] - bigger); + maxval = bigger; } - __syncthreads(); - // computes the global maxval and sumval of row `idx` - if (tid < warpsPerBlock / 2) { - #pragma unroll - for (int offset = warpsPerBlock / 2; offset > 0; offset >>= 1) { - if (tid < offset) { - tmp_maxval = fmaxf(maxvals[tid], maxvals[tid + offset]); - sumvals[tid] = sumvals[tid] * expf(maxvals[tid] - tmp_maxval) + - sumvals[tid + offset] * - expf(maxvals[tid + offset] - tmp_maxval); - maxvals[tid] = tmp_maxval; - } + // using warp functions instead of cooperative groups for better readibility + // calculate the warp wised maxval and sumval + float offsetMaxval, offsetSumval; + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + __syncwarp(); + offsetMaxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); + offsetSumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); + if (offsetMaxval > maxval) { + sumval *= expf(maxval - offsetMaxval); + maxval = offsetMaxval; + } else { + offsetSumval *= expf(offsetMaxval - maxval); } + sumval += offsetSumval; } - __syncthreads(); - // write the final results into `out` - maxval = maxvals[0]; - float sum = sumvals[0]; - for (int i = tid; i < 
C; i += blockDim.x * UNROLL_FACTOR) { - #pragma unroll - for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { - // __stcs(&y[i + j * blockDim.x], expf(x[i + j * blockDim.x] - maxval) / sum); - y[i + j * blockDim.x] = expf(x[i + j * blockDim.x] - maxval) / sum; - } + // retrive the warp wised maxval and sumval + // which are also the maxval and sumval of one row in C + maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); + sumval = __shfl_sync(0xFFFFFFFF, sumval, 0); + + for (int i = laneId; i < C; i += warpSize) { + y[i] = expf(x[i] - maxval) / sumval; } } @@ -643,9 +617,8 @@ void softmax_forward7(float* out, const float* inp, int N, int C, int block_size } void softmax_forward_online8(float* out, const float* inp, int N, int C, int block_size) { - const int grid_size = N; - size_t shared_mem_size = 2 * block_size / 32 * sizeof(float); - softmax_forward_online_kernel8<<>>(out, inp, N, C); + const int grid_size = ceil_div(N * 32, block_size); + softmax_forward_online_kernel8<<>>(out, inp, N, C); cudaCheck(cudaGetLastError()); } From 6d7a99cb4e70464228b13b6d80996484fe1c38ad Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 17:45:14 +0800 Subject: [PATCH 040/172] Adjust comments --- dev/cuda/softmax_forward.cu | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index fb084dad8..b487b956d 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -515,6 +515,9 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { + // online softmax paper: http://arxiv.org/abs/1805.02867 + // online softmax reduces loops from 3 to 2 + // which is done by calculating sumval and maxval in one loop const int warpsPerBlock = blockDim.x / warpSize; int tid = threadIdx.x; @@ -524,6 +527,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int int warpId = tid / warpSize; int laneId = tid % warpSize; + // one warp one row int row = blockIdx.x * warpsPerBlock + warpId; if (row >= N) { @@ -533,7 +537,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int const float* x = inp + row * C; float* const y = out + row * C; - // merging calculating maxval and sumval in one loop + // merge calculating maxval and sumval in one loop // which is an arithmetic improvment from online softmax over normal softmax float maxval = -INFINITY, sumval = 0.0f, bigger; for (int i = laneId; i < C; i += warpSize) { @@ -544,7 +548,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int maxval = bigger; } - // using warp functions instead of cooperative groups for better readibility + // use warp functions instead of cooperative groups for better readibility // calculate the warp wised maxval and sumval float offsetMaxval, offsetSumval; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { @@ -560,7 +564,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int sumval += offsetSumval; } - // retrive the warp wised maxval and sumval + // sync the warp wised maxval and sumval // which are also the maxval and sumval of one row in C maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); sumval = __shfl_sync(0xFFFFFFFF, sumval, 0); From 99e5c5d2835ef25286f2ccf05914970e603de776 Mon Sep 17 00:00:00 2001 From: Paul Maragakis Date: Wed, 8 May 2024 15:17:25 -0400 Subject: [PATCH 041/172] Allow the 
code to train gpt2-xl This commit is the fix by adameure described in the comment below: https://github.com/karpathy/llm.c/pull/382#issuecomment-2100895501 To reproduce the following bug: https://github.com/karpathy/llm.c/pull/382#issuecomment-2100648148 First switch to gpt2-xl for tinystories with the following three commands: python prepro_tinystories.py # download tinystories sed -i 's/from_pretrained("gpt2")/from_pretrained("gpt2-xl")/' train_gpt2.py # select gpt2-xl python train_gpt2.py --input_bin data/TinyStories_train.bin --batch_size 1 # Prep the large model binaries Now you can compile and try the code with or without the fix. On an H100 machine I use the following command: train_gpt2cu -i data/TinyStories -b 10 -x 100 # gets 230k/s on one H100 node of 8 GPU --- train_gpt2.cu | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 95938d7e0..0150ceba8 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -812,16 +812,18 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i shared[idx] = 0.0f; } __syncthreads(); - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + idx*OC); - for (int k = 0; k < x128::size; k++) { - accumulators[k] += (float)packed_dout[k]; - } - } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance - // so we accumulate in a conflict-free order, then reorder to match the global memory order - for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + if(global_oc < OC) { + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // so we accumulate in a conflict-free order, then reorder to match the global memory order + for (int k = 0; k < x128::size; k++) { + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + } } if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data __syncthreads(); @@ -834,7 +836,9 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i shared[local_oc + threadIdx.y] = tmp; __syncthreads(); // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + if (i + blockIdx.x*OC_per_warp < OC) { + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + } } __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? @@ -1366,11 +1370,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, const int OC_per_warp = warp_size * x128::size; // 256 at BF16 const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 - const int grid_size_x = OC / OC_per_warp; // e.g. 3 horizontal blocks for 768 OCs at BF16 + const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! 
- assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); From 0ce5fcf7e8be6d8865cfdc0884ebe05f9a31cfdd Mon Sep 17 00:00:00 2001 From: Anerudhan Date: Wed, 8 May 2024 19:53:48 +0000 Subject: [PATCH 042/172] Rename cudnn_att.cu to cudnn_att.cpp to speed up compilation. --- Makefile | 4 ++-- cudnn_att.cu => cudnn_att.cpp | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename cudnn_att.cu => cudnn_att.cpp (100%) diff --git a/Makefile b/Makefile index c4879588b..60e60b6a7 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ train_gpt2: train_gpt2.c test_gpt2: test_gpt2.c $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -$(NVCC_CUDNN): cudnn_att.cu +$(NVCC_CUDNN): cudnn_att.cpp $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) @@ -256,4 +256,4 @@ profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: - $(REMOVE_FILES) $(TARGETS) + $(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN) diff --git a/cudnn_att.cu b/cudnn_att.cpp similarity index 100% rename from cudnn_att.cu rename to cudnn_att.cpp From 84a2aa0dd08039493a7226c505bad919e594b870 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 8 May 2024 20:13:14 +0000 Subject: [PATCH 043/172] allow exporting all GPT-2 model sizes now, using --model flag in python script. also allow loading all the models from C, using the new flag -e, to point directly to the file to load. added some error handling for potentially common mistakes. 
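for example, exporting and then training the 355M model would look something like this (commands are illustrative):

    python train_gpt2.py --model gpt2-medium    # writes gpt2_355M.bin and gpt2_355M_bf16.bin
    ./train_gpt2cu -e gpt2_355M_bf16.bin        # point the CUDA trainer at that checkpoint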
note that loadin GPT-2 XL does not work yet and crashes, but I am imminently merging a fix as the next commit that resolves this --- train_gpt2.cu | 17 ++++++++++++++--- train_gpt2.py | 12 ++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 95938d7e0..4374682ec 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -69,7 +69,6 @@ enum PrecisionMode { typedef float floatX; #define CUBLAS_LOWP CUDA_R_32F #define PRECISION_MODE PRECISION_FP32 -const char* load_filename = "gpt2_124M.bin"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclFloat; #endif @@ -79,7 +78,6 @@ const ncclDataType_t ncclFloatX = ncclFloat; typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define PRECISION_MODE PRECISION_FP16 -const char* load_filename = "gpt2_124M.bin"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclHalf; #endif @@ -88,7 +86,6 @@ const ncclDataType_t ncclFloatX = ncclHalf; typedef __nv_bfloat16 floatX; #define CUBLAS_LOWP CUDA_R_16BF #define PRECISION_MODE PRECISION_BF16 -const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights specific filename #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclBfloat16; #endif @@ -1744,6 +1741,17 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n"); exit(EXIT_FAILURE); } + if (PRECISION_MODE == PRECISION_BF16 && version != 5) { + fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n"); + exit(EXIT_FAILURE); + } + if (PRECISION_MODE == PRECISION_FP32 && version != 3) { + fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n"); + fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n"); + exit(EXIT_FAILURE); + } // read in hyperparameters model->config.max_seq_len = model_header[2]; @@ -2370,6 +2378,7 @@ void error_usage() { fprintf(stderr, "Example: ./train_gpt2cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); @@ -2392,6 +2401,7 @@ int main(int argc, char *argv[]) { // read in the (optional) command line arguments const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. 
data/TinyStories + const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max @@ -2410,6 +2420,7 @@ int main(int argc, char *argv[]) { if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU batch size else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } diff --git a/train_gpt2.py b/train_gpt2.py index 7fceaff21..80547b8f1 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -395,6 +395,7 @@ def print0(*args, **kwargs): # python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 parser = argparse.ArgumentParser() parser.add_argument("--input_bin", type=str, default="data/tiny_shakespeare_val.bin", help="input .bin to train on") + parser.add_argument("--model", type=str, default="gpt2", help="gpt2|gpt2-medium|gpt2-large|gpt2-xl") parser.add_argument("--write_tensors", type=int, default=1, help="write tensors to disk") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") parser.add_argument("--dtype", type=str, default="float32", help="float32|float16|bfloat16") @@ -409,6 +410,8 @@ def print0(*args, **kwargs): B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 assert args.dtype in {"float32", "float16", "bfloat16"} + assert args.model in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"} + model_to_size = {"gpt2": "124M", "gpt2-medium": "355M", "gpt2-large": "774M", "gpt2-xl": "1558M"} # set up DDP (distributed data parallel). torchrun sets this env variable ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run? @@ -469,7 +472,7 @@ def print0(*args, **kwargs): write_tokenizer(enc, "gpt2_tokenizer.bin") # load the GPT-2 model weights - model = GPT.from_pretrained("gpt2") + model = GPT.from_pretrained(args.model) model.train() model.to(device) if args.compile: @@ -519,11 +522,12 @@ def get_batch(): logits, loss = model(x, y) loss.backward() # save model params, in both float32 and bfloat16 - write_model(model, "gpt2_124M.bin", dtype="float32") - write_model(model, "gpt2_124M_bf16.bin", dtype="bfloat16") + model_size_str = model_to_size[args.model] # e.g. "124M" + write_model(model, f"gpt2_{model_size_str}.bin", dtype="float32") + write_model(model, f"gpt2_{model_size_str}_bf16.bin", dtype="bfloat16") # save x, y, logits, loss, and parameter gradients, for debugging C # always store these in fp32 to have an accurate reference (?) 
- write_state(model, x, y, logits, loss, "gpt2_124M_debug_state.bin") + write_state(model, x, y, logits, loss, f"gpt2_{model_size_str}_debug_state.bin") # ------------------------------------------------------------------------- # STAGE 2: training loop to get timings From 26dbbc75c29b618d0038c92776424124e57d0598 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 8 May 2024 20:18:33 +0000 Subject: [PATCH 044/172] unbreak the tests oops --- test_gpt2.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_gpt2.cu b/test_gpt2.cu index d7944125c..7613b6ba3 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -85,8 +85,16 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size int main(int argc, char *argv[]) { common_start(false, true); + // set the right paths + #if defined(ENABLE_BF16) + const char* load_filename = "gpt2_124M_bf16.bin"; + #else + const char* load_filename = "gpt2_124M.bin"; + #endif + // build the GPT-2 model from a checkpoint GPT2 model; + gpt2_build_from_checkpoint(&model, load_filename); size_t V = model.config.vocab_size; size_t Vp = model.config.padded_vocab_size; From 69aa64cc80410b590437eb5b41cb348b7da45254 Mon Sep 17 00:00:00 2001 From: Joe Halabi Date: Tue, 7 May 2024 17:18:03 -0700 Subject: [PATCH 045/172] Adds cuDNN install instructions for a smaller but sufficient package - Modifies README.md to provide example apt-get cuDNN install instructions that install the cuDNN dev package. "sudo apt-get install -y cudnn" will install the default cuDNN packages, but for a minimal setup, installing the dev package will see a 50% reduction in both, download size (~850MB to 425MB now) and local storage size (~2GB to ~1GB now). - Modifies the Makefile to point users to the README for cuDNN install instructions (through comments and the cuDNN install error message) - Modifies attention_forward.cu comments to point users to the README for cuDNN install instructions Signed-off-by: Vedaanta Agarwalla --- Makefile | 10 +++------- README.md | 13 +++++++++++-- dev/cuda/attention_forward.cu | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 60e60b6a7..eabb5646d 100644 --- a/Makefile +++ b/Makefile @@ -87,11 +87,7 @@ endif # Check and include cudnn if available # You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line -# You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions or Windows instructions to install the cuDNN library -# And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory or directory of your choice -# and then we include it below (see currently hard-coded path assumed in home directory) +# Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now in either . or ($HOME) directory @@ -103,7 +99,7 @@ ifeq ($(USE_CUDNN), 1) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else - $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) + $(error ✗ cuDNN not found. 
See the README for install instructions and the Makefile for hard-coded paths) endif NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) NVCC_LDFLAGS += -lcudnn @@ -119,7 +115,7 @@ ifeq ($(USE_CUDNN), 1) else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location else - $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) endif CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) diff --git a/README.md b/README.md index dbb99e030..29d53689e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see `Makefile` for instructions), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can also go faster with flash attention ```bash make train_gpt2cu USE_CUDNN=1 @@ -256,7 +256,16 @@ If you have the latest CUDA you should expect this to compile OK, and you should make train_gpt2cu USE_CUDNN=1 ``` -This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. Follow the [cuDNN installation instructions](https://developer.nvidia.com/cudnn) to install cuDNN with apt-get. On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). +This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. The [cuDNN installation instructions](https://developer.nvidia.com/cudnn) with apt-get will grab the default set of cuDNN packages. For a minimal setup, the cuDNN dev package is sufficient, e.g. on Ubuntu 22.04 for CUDA 12.x: + +```bash +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install libcudnn9-dev-cuda-12 +``` + +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a9325f085..8b6aaf61d 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -2,8 +2,8 @@ Kernels for attention forward pass. 
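Once the dev package above is installed and the header-only frontend is fetched, a cuDNN-enabled build is just the usual target with `USE_CUDNN=1`. A minimal sketch, assuming the frontend is cloned into the home directory where the Makefile currently looks for it:

```bash
# the Makefile's hard-coded search path is ~/cudnn-frontend/include
git clone https://github.com/NVIDIA/cudnn-frontend.git ~/cudnn-frontend
make train_gpt2cu USE_CUDNN=1
./train_gpt2cu
```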
If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels -You need cuDNN from: https://developer.nvidia.com/cudnn -And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main + +See the README for cuDNN install instructions Compile example with cuDNN: nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward From 99e765de8d7e743994268d32e64e192be5e3874f Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Thu, 9 May 2024 11:19:24 +0200 Subject: [PATCH 046/172] moved bf16 boilerplate to common.h --- dev/cuda/attention_forward.cu | 9 ++------ dev/cuda/common.h | 41 ++++++++++++++++++++++++++++++++++ dev/cuda/encoder_forward.cu | 16 +------------ dev/cuda/gelu_backward.cu | 16 +------------ dev/cuda/gelu_forward.cu | 16 +------------ dev/cuda/layernorm_backward.cu | 15 +------------ dev/cuda/residual_forward.cu | 15 +------------ 7 files changed, 48 insertions(+), 80 deletions(-) diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a9325f085..66cfa629b 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -53,14 +53,9 @@ version 11 is kernel 10 skipping FP16/FP32 conversions (full FP16/BF16 network) #include #include #include -#include "common.h" -// ---------------------------------------------------------------------------- -// Floating point precision setup -typedef __nv_bfloat16 floatX; // half or __nv_bfloat16 (or float) -#define CUBLAS_LOWP CUDA_R_16BF // CUDA_R_16F or CUDA_R_16BF (or CUDA_R_32F) -// CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_16F (for CUDA_R_16F only, potentially slower?!) -#define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F +#define ENABLE_BF16 +#include "common.h" // ---------------------------------------------------------------------------- // CUDA & cuDNN setup diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 63d0e1de1..788a8f505 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -117,6 +117,47 @@ __device__ void store128cg(ElementType* target, Packed128 value) { __stcg(reinterpret_cast(target), value.get_bits()); } +// ---------------------------------------------------------------------------- +// reduced/mixed precision utilities + +#if defined(ENABLE_BF16) + +typedef __nv_bfloat16 floatX; +typedef __nv_bfloat16 floatN; +#define CUBLAS_LOWP CUDA_R_16BF // CUDA_R_16F or CUDA_R_16BF (or CUDA_R_32F) +// CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_16F (for CUDA_R_16F only, potentially slower?!) +#define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F + +#elif defined(ENABLE_FP16) + +typedef half floatX; +typedef half floatN; + +#else + +typedef float floatX; +typedef float floatN; +#endif + +typedef Packed128 x128; + + +// older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being unsigned shorts. +// we need to be careful here to only define our own versions if none already exist, otherwise the compiler will +// complain. 
+// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) +#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) +__device__ floatX __ldcs(const floatX* address) { + unsigned short bf = __ldcs(reinterpret_cast(address)); + return __nv_bfloat16_raw{bf}; +} + +__device__ void __stcs(floatX* address, floatX value) { + __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); +} +#endif + + // ---------------------------------------------------------------------------- // random utils diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu index 16df62f34..e901fd654 100644 --- a/dev/cuda/encoder_forward.cu +++ b/dev/cuda/encoder_forward.cu @@ -17,24 +17,10 @@ version 3 is like version 2 but uses float4 reads/writes #include #include #include -#include "common.h" #include -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu index 8c64d7ca3..bbd81c4bc 100644 --- a/dev/cuda/gelu_backward.cu +++ b/dev/cuda/gelu_backward.cu @@ -19,23 +19,9 @@ version 2 uses the Packed128 data structure #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu index 27aa9d598..e07ad663a 100644 --- a/dev/cuda/gelu_forward.cu +++ b/dev/cuda/gelu_forward.cu @@ -19,23 +19,9 @@ version 2 is bfloat16 with the Packed128 data structure #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 1f432ba82..b3084e126 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -17,22 +17,9 @@ version 2 moves a lot of reduction to shared memory over global memory #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff 
--git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu index bbbcde270..f07871a29 100644 --- a/dev/cuda/residual_forward.cu +++ b/dev/cuda/residual_forward.cu @@ -13,23 +13,10 @@ version 2 packs input into 128 bit memory reads #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 +#include "common.h" -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; // ---------------------------------------------------------------------------- // CPU code reference lol From 01c7a331bbcefa08ecb5cac21e0b9dacb79ef53f Mon Sep 17 00:00:00 2001 From: Anerudhan Date: Thu, 9 May 2024 19:07:00 +0000 Subject: [PATCH 047/172] - Simplify graph cache and usage of cudnn. - Fix failures in H100 --- cudnn_att.cpp | 133 +++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/cudnn_att.cpp b/cudnn_att.cpp index fd9760b1a..04b1a92ec 100644 --- a/cudnn_att.cpp +++ b/cudnn_att.cpp @@ -60,38 +60,35 @@ static void checkCudnnFE(fe::error_object e, const char *file, int line) { } #define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__) -using graph_tensors_fwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // O - std::shared_ptr // Stats ->; - -using graph_tensors_bwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // O - std::shared_ptr, // dO - std::shared_ptr, // Stats - std::shared_ptr, // Attn_scale, - std::shared_ptr, // dQ, - std::shared_ptr, // dK, - std::shared_ptr // dV ->; +enum UIDs { + Q_UID, + K_UID, + V_UID, + Attn_scale_UID, + O_UID, + Stats_UID, + dO_UID, + dQ_UID, + dK_UID, + dV_UID +}; // Need a cache because graph->build_operation_graph() is slow but everything else seems fast -using cache_type_fwd = std::unordered_map; -using cache_type_bwd = std::unordered_map; +using cache_type_fwd = std::map, std::shared_ptr>; +using cache_type_bwd = std::map, std::shared_ptr>; // Loosely based on cuDNN frontend samples functions and massively simplified -template -auto lookup_cache_or_build_graph_fwd(Args... args) { +auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) { + static cache_type_fwd user_maintained_cache_fwd; - auto [B, H, T, HS, is_inference_only] = std::make_tuple(args...); + auto key = std::make_tuple(B, H, T, HS, is_inference_only); + + auto it = user_maintained_cache_fwd.find(key); + if (it != user_maintained_cache_fwd.end()) { + return it->second; + } + auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) .set_intermediate_data_type(fe::DataType_t::FLOAT) @@ -100,16 +97,20 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) + .set_uid(Q_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) + .set_uid(K_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) + .set_uid(V_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) + .set_uid(Attn_scale_UID) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); @@ -122,38 +123,47 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32 - O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}); + O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID); assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_stride({H * T, T, 1, 1}) + .set_uid(Stats_UID); } checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_fwd.find(key); - if (it != user_maintained_cache_fwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); - assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace + // Reallocate the workspace if the required size is greater than the current workspace + // In H100 this may be around 16B + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } - auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); - user_maintained_cache_fwd.insert({key, tuple}); - return tuple; + user_maintained_cache_fwd.insert({key, graph}); + + return graph; } -template -auto lookup_cache_or_build_graph_bwd(Args... args) { +auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { static cache_type_bwd user_maintained_cache_bwd; - auto [B, NH, T, HS] = std::make_tuple(args...); + + auto key = std::make_tuple(B, NH, T, HS); + + auto it = user_maintained_cache_bwd.find(key); + if (it != user_maintained_cache_bwd.end()) { + return it->second; + } auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) @@ -164,28 +174,35 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { // must come from inp (which means we also need to convert THAT to FP16) auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, NH, T, HS}) + .set_uid(Q_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, NH, T, HS}) + .set_uid(K_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, NH, T, HS}) + .set_uid(V_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") .set_dim({B, NH, T, HS}) + .set_uid(O_UID) .set_stride({NH * HS * T, HS, NH * HS, 1})); auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") .set_dim({B, NH, T, HS}) + .set_uid(dO_UID) .set_stride({NH * HS * T, HS, NH * HS, 1})); auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") .set_dim({B, NH, T, 1}) + .set_uid(Stats_UID) .set_stride({NH * T, T, 1, 1}) .set_data_type(fe::DataType_t::FLOAT)); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) + .set_uid(Attn_scale_UID) .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") .set_causal_mask(true) @@ -194,16 +211,11 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); - dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); + dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID); + dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID); + dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID); checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_bwd.find(key); - if (it != user_maintained_cache_bwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); @@ -221,9 +233,8 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); } - auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); - user_maintained_cache_bwd.insert({key, tuple}); - return tuple; + user_maintained_cache_bwd.insert({key, graph}); + return graph; } void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) @@ -235,8 +246,7 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) bool is_inference_only = (stats == nullptr); // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, attn_scale, O, softmax_stats] = - lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); + auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); // Prepare all the tensor pointers for executing the graph void* devPtrQ = inp; @@ -246,12 +256,12 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) void* devPtrO = out; // Build variant pack - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {attn_scale, &attn_scale_cpu}, {O, devPtrO}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}}; // Add the stats tensor unless we are only doing inference (only needed for backward pass) if (is_inference_only == false) { - variant_pack[softmax_stats] = stats; + variant_pack[Stats_UID] = stats; } // Execute graph @@ -266,8 +276,7 @@ void attention_backward_cudnn(floatX* dqkvr, int HS = C / NH; // number of features per head // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, O, dO, Stats, attn_scale, dQ, dK, dV] = - lookup_cache_or_build_graph_bwd(B, NH, T, HS); + auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS); // Prepare all the tensor pointers for executing the graph void* devPtrQ = qkvr; @@ -283,10 +292,10 @@ void attention_backward_cudnn(floatX* dqkvr, void* devPtrdV = (dqkvr + 2 * NH * HS); // Build variant pack that links each tensor to its data pointer - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {O, devPtrO}, {dO, devPtrdO}, {Stats, devPtrStats}, - {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, - {attn_scale, &attn_scale_cpu}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats}, + {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV}, + {Attn_scale_UID, &attn_scale_cpu}}; // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); From 691c1df969bf893054731613f7e3ae9299aed86d Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Thu, 9 May 2024 23:48:11 +0200 Subject: [PATCH 048/172] fused layernorm+residual --- dev/cuda/Makefile | 3 +- dev/cuda/attention_forward.cu | 8 - dev/cuda/classifier_fused.cu | 8 - dev/cuda/common.h | 7 + dev/cuda/fused_residual_forward.cu | 695 +++++++++++++++++++++++++++++ dev/cuda/softmax_forward.cu | 8 - train_gpt2.cu | 145 +++++- 7 files changed, 833 insertions(+), 41 deletions(-) create mode 100644 dev/cuda/fused_residual_forward.cu diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index 834a98b0f..c74178851 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -18,7 +18,7 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux- $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ # Build all targets -TARGETS = adamw 
attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward +TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward all: $(TARGETS) # Individual targets: forward pass @@ -28,6 +28,7 @@ crossentropy_forward: crossentropy_forward.cu encoder_forward: encoder_forward.cu gelu_forward: gelu_forward.cu layernorm_forward: layernorm_forward.cu +fused_residual_forward: fused_residual_forward.cu residual_forward: residual_forward.cu softmax_forward: softmax_forward.cu trimat_forward: trimat_forward.cu diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index 66cfa629b..b22b3f132 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -240,14 +240,6 @@ __device__ float warpReduceMax(float val) { return val; } -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_down_sync(0xFFFFFFFF, val, offset); - } - return val; -} - __global__ void softmax_forward_kernel4(float* out, const float* inp, int N, int C) { // out is (N, C) just like inp. Each row of inp will get softmaxed. // same as kernel3, but can handle any block size (multiple of 32) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index c44727f73..092de5955 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -98,14 +98,6 @@ void crossentropy_softmax_backward_cpu(float* dlogits, // ---------------------------------------------------- // Kernel Utils -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} - // warp-level reduction for finding the maximum value __device__ float warpReduceMax(float val) { for (int offset = 16; offset > 0; offset /= 2) { diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 788a8f505..5da54fe1d 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -10,6 +10,13 @@ __host__ __device__ T ceil_div(T dividend, T divisor) { return (dividend + divisor-1) / divisor; } +__device__ float warpReduceSum(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_xor_sync(0xFFFFFFFF, val, offset); + } + return val; +} + // ---------------------------------------------------------------------------- // checking utils diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu new file mode 100644 index 000000000..f228503af --- /dev/null +++ b/dev/cuda/fused_residual_forward.cu @@ -0,0 +1,695 @@ +/* +Kernels for residual forward pass fused with layernorm + +Compile example: +nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward + +version 1 is naive port from CPU code to kernel +./fused_residual_forward 1 +version 2 packs input into 128 bit memory reads +./fused_residual_forward 2 +*/ + +#include +#include +#include "assert.h" +#include + +#define ENABLE_BF16 
+#include "common.h" + +// ---------------------------------------------------------------------------- +// CPU code reference lol + +void residual_forward_cpu(float* out, const float* inp1, const float* inp2, int N) { + for (int i = 0; i < N; i++) { + out[i] = inp1[i] + inp2[i]; + } +} + +void layernorm_forward_cpu(float* out, float* mean, float* rstd, + const float* inp, const float* weight, const float* bias, + int B, int T, int C) { + float eps = 1e-5f; + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + // seek to the input position inp[b,t,:] + const float* x = inp + b * T * C + t * C; + // calculate the mean + float m = 0.0f; + for (int i = 0; i < C; i++) { + m += x[i]; + } + m = m/C; + // calculate the variance (without any bias correction) + float v = 0.0f; + for (int i = 0; i < C; i++) { + float xshift = x[i] - m; + v += xshift * xshift; + } + v = v/C; + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + // seek to the output position in out[b,t,:] + float* out_bt = out + b * T * C + t * C; + for (int i = 0; i < C; i++) { + float n = (s * (x[i] - m)); // normalized output + float o = n * weight[i] + bias[i]; // scale and shift it + out_bt[i] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[b * T + t] = m; + rstd[b * T + t] = s; + } + } +} + +// ---------------------------------------------------------------------------- +// GPU kernels + +// elementwise ops are nice and ez +__global__ void residual_forward_kernel1(floatX* out, const floatX* inp1, const floatX* inp2, int N) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N) { + out[idx] = (floatX)((float)inp1[idx] + (float)inp2[idx]); + } +} + +// naive drag and drop implementation into kernel, parallelize over B,T, loop over C +__global__ void layernorm_forward_kernel1(floatX* out, floatX* mean, floatX* rstd, + const floatX* inp, const floatX* weight, const floatX* bias, + int N, int C) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float eps = 1e-5f; + + if (idx < N) { + // seek to the input position inp[idx,:] + const floatX* x = inp + idx * C; + // calculate the mean + float m = 0.0f; + for (int i = 0; i < C; i++) { + m += (float)x[i]; + } + m = m / C; + // calculate the variance (without any bias correction) + float v = 0.0f; + for (int i = 0; i < C; i++) { + float xshift = (float)x[i] - m; + v += xshift * xshift; + } + v = v / C; + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + // seek to the output position in out[idx,:] + floatX* out_idx = out + idx * C; + for (int i = 0; i < C; i++) { + float n = (s * ((float)x[i] - m)); // normalized output + float o = n * (float)weight[i] + (float)bias[i]; // scale and shift it + out_idx[i] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[idx] = m; + rstd[idx] = s; + } +} + +// naive fusion; uncoalesced access pattern leads to terrible performance +__global__ void fused_residual_forward2(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + float eps = 1e-5f; + + float m = 0.0f; + for(int c = 0; c < C; ++c) { + float out = (float)inp1[c] + (float)inp2[c]; + m += out; + residual[c] = out; + } + + m = m / C; + float v = 0.0f; + for (int c = 0; c < C; c++) { + 
float xshift = (float)residual[c] - m; + v += xshift * xshift; + } + v = v / C; + + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + for (int c = 0; c < C; c++) { + float n = (s * ((float)residual[c] - m)); // normalized output + float o = n * (float)weight[c] + (float)bias[c]; // scale and shift it + normed[c] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[idx] = m; + rstd[idx] = s; +} + +// handle one token per warp for coalesced access +__global__ void fused_residual_forward3(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + float eps = 1e-5f; + float m = 0.0f; + for(int c = threadIdx.x; c < C; c += WarpSize) { + float out = (float)inp1[c] + (float)inp2[c]; + m += out; + residual[c] = out; + } + + m = warpReduceSum(m); + + m = m / C; + float v = 0.0f; + for(int c = threadIdx.x; c < C; c += WarpSize) { + float xshift = (float)residual[c] - m; + v += xshift * xshift; + } + + v = warpReduceSum(v); + v = v / C; + + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + for(int c = threadIdx.x; c < C; c += WarpSize) { + float n = (s * ((float)residual[c] - m)); // normalized output + float o = n * (float)weight[c] + (float)bias[c]; // scale and shift it + normed[c] = o; // write + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +// vectorized loading, single pass stats, streaming access and zigzag loop +__global__ void fused_residual_forward_kernel4(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + using x128 = Packed128; + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + float sum_sq = 0.0f; + int c = threadIdx.x * x128::size; + for(; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + sum_sq += (float)out[k] * (float)out[k]; + } + store128(residual + c, out); + } + + sum = warpReduceSum(sum); + sum_sq = warpReduceSum(sum_sq); + + float m = sum / C; + float v = sum_sq / C - m * m; + float s = rsqrtf(v + eps); + + c -= WarpSize * x128::size; + for(; c >= 0; c -= WarpSize * x128::size) { + const x128 res = load128cs(residual + c); + const x128 w = load128(weight + c); + const x128 b = load128(bias + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +// what do you want in shared memory? EVERYTHING! 
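Kernel 4 above gets away with a single streaming pass by using the identity Var(x) = E[x^2] - E[x]^2; the price is numerical robustness, because when the mean is large relative to the spread the two terms nearly cancel in float32. A small host-side sketch of the effect (values are made up purely to exaggerate the cancellation):

```c
// one-pass vs. two-pass variance, illustrative only
#include <stdio.h>

int main(void) {
    const int C = 4;
    float x[4] = {1000.1f, 1000.2f, 1000.3f, 1000.4f}; // exact variance is 0.0125
    float sum = 0.f, sum_sq = 0.f;
    for (int i = 0; i < C; i++) { sum += x[i]; sum_sq += x[i] * x[i]; }
    float m = sum / C;
    float v_onepass = sum_sq / C - m * m;   // E[x^2] - E[x]^2, as in kernel 4
    float v_twopass = 0.f;                  // squared deviations, as in kernels 5 and 6
    for (int i = 0; i < C; i++) { v_twopass += (x[i] - m) * (x[i] - m); }
    v_twopass /= C;
    printf("one-pass: %g   two-pass: %g\n", v_onepass, v_twopass);
    return 0;
}
```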
+// thus, we no longer require zigzag loops and can do the numerically more stable variance estimation +// needs special attention in the kernel launcher to ensure we have enough smem. +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + + +// using multiple warps per token, and keep threads persistent, so we never have to reload weights and biases +// if we had one warp per token, though, this would require us to use a huge amount of shared memory. Therefore, +// we use multiple warps per token; but generally we cannot use the entire block, because that would give too +// little work per warp to be effective (each warp processes 256 bfloat16 elements, so for C=768 more than 3 warps +// will just mean idle). Therefore, we add a z dimension, where warps with different z handle different tokens. 
+// all this makes the launcher logic more complicated :( +__global__ void fused_residual_forward_kernel6(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + // weights and biases are shared among all tokens + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params + C * sizeof(floatX)); + // residual output (input to layernorm) is indpendent for each sub-block indicates by threadIdx.z + x128* s_res = reinterpret_cast(params + (2 + threadIdx.z) * C * sizeof(floatX) ); + // similarly, each sub-block needs its own reduction buffers + float* s_mean = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + threadIdx.z * 32 * sizeof(float)); + float* s_var = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + 32 * sizeof(float) * (blockDim.z + threadIdx.z)); + + int cidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + int step = blockDim.y * WarpSize * x128::size; + + for(int c = cidx; c < C; c += step) { + s_weight[c / x128::size] = load128(weight + c); + s_bias[c / x128::size] = load128(bias + c); + } + // the block-level reductions will cause sync before the first time we read these + // => no syncthreads needed here + + + // loop over all tokens + for(int tidx = blockIdx.x * blockDim.z + threadIdx.z; tidx < N; tidx += gridDim.x * blockDim.z) { + // adjust pointers to current token + floatX* residual_bt = residual + C * tidx; + floatX* normed_bt = normed + C * tidx; + const floatX* inp1_bt = inp1 + C * tidx; + const floatX* inp2_bt = inp2 + C * tidx; + + const float eps = 1e-5f; + float sum = 0.0f; + for (int c = cidx; c < C; c += step) { + const x128 in1 = load128cs(inp1_bt + c); + const x128 in2 = load128cs(inp2_bt + c); + x128 out; + for (int k = 0; k < x128::size; ++k) { + out[k] = (float) in1[k] + (float) in2[k]; + sum += (float) out[k]; + } + store128cs(residual_bt + c, out); + s_res[c / x128::size] = out; + } + sum = warpReduceSum(sum); + if(threadIdx.x == 0) { + s_mean[threadIdx.y] = sum; + } + __syncthreads(); + float m = warpReduceSum(threadIdx.x < blockDim.y ? s_mean[threadIdx.x] : 0.f) / C; + // normally, we'd syncthread here to make sure that no warp is already at the next + // iteration of the loop, messing with s_mean. The fact that we interleave s_mean and s_var means + // we don't need these additional syncs. + float v = 0.f; + + for (int c = cidx; c < C; c += step) { + const x128 res = s_res[c / x128::size]; + for (int k = 0; k < x128::size; ++k) { + v += ((float) res[k] - m) * ((float) res[k] - m); + } + } + + v = warpReduceSum(v); + if(threadIdx.x == 0) { + s_var[threadIdx.y] = v; + } + __syncthreads(); + v = warpReduceSum(threadIdx.x < blockDim.y ? 
s_var[threadIdx.x] : 0.f) / C; + float s = rsqrtf(v + eps); + + for (int c = cidx; c < C; c += step) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for (int k = 0; k < x128::size; ++k) { + float n = s * ((float) res[k] - m); // normalized output + float o = n * (float) w[k] + (float) b[k]; // scale and shift it + out[k] = o; + } + + store128(normed_bt + c, out); + } + // cache the mean and rstd for the backward pass later + if (threadIdx.x == 0 && threadIdx.y == 0) { + mean[tidx] = m; + rstd[tidx] = s; + } + } +} + + + +// ---------------------------------------------------------------------------- +// kernel launcher + +void fused_residual_forward1(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + const int grid_size_resid = ceil_div(N * C, block_size); + residual_forward_kernel1<<>>(residual, inp1, inp2, N*C); + cudaCheck(cudaGetLastError()); + const int grid_size_ln = ceil_div(N, block_size); + layernorm_forward_kernel1<<>>(normed, mean, rstd, residual, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward2(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + const int grid_size = ceil_div(N, (int)(block_size)); + fused_residual_forward2<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward3(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + fused_residual_forward3<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward4(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. 
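To put numbers on the comment above: the request is (2 + block_y) * C * sizeof(floatX) bytes, covering one x128 copy of the weights, one of the biases, and one residual scratch row per warp. For the benchmark's C = 768 in bf16 (2 bytes) and block_size = 256 (block_y = 8) that is (2 + 8) * 768 * 2 = 15,360 bytes, comfortably under the default 48 KiB; at block_size = 1024 (block_y = 32) it grows to 52,224 bytes, at which point the cudaFuncSetAttribute opt-in is required and, if the device refuses it, the kernel 4 fallback takes over. FP32 activations double these figures.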
+ cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward6(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int warps_per_token = max(1, C / Packed128::size / 32); + int total_warps = block_size / 32; + int block_z = max(1, total_warps / warps_per_token); + int block_y = max(1, total_warps / block_z); + size_t smem = (2 + block_z) * C * sizeof(floatX) + 64 * sizeof(float) * block_z; + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + const int num_blocks = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); + fused_residual_forward_kernel6<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + const int grid_size = ceil_div(N, total_warps); + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +// kernel version dispatch +void fused_residual_forward(int kernel_num, floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + switch (kernel_num) { + case 1: + fused_residual_forward1(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 2: + fused_residual_forward2(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 3: + fused_residual_forward3(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 4: + fused_residual_forward4(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 5: + fused_residual_forward5(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 6: + fused_residual_forward6(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + default: + printf("Invalid kernel number\n"); + exit(1); + } +} + +// ---------------------------------------------------------------------------- + +int main(int argc, const char **argv) { + setup_main(); + + int B = 8; + int T = 1024; + int C = 768; + + // read kernel_num from command line + int kernel_num = 1; + if (argc > 1) { + kernel_num = atoi(argv[1]); + } + printf("Using kernel %d\n", kernel_num); + + // create host memory of random numbers + float* residual = (float*)malloc(B * T * C * sizeof(float)); + float* normed = (float*)malloc(B * T * C * sizeof(float)); + float* inp1 = make_random_float(B * T * C); + float* inp2 = make_random_float(B * T * C); + float* mean = (float*)malloc(B * T * sizeof(float)); + float* rstd = (float*)malloc(B * T * sizeof(float)); + float* weight = make_random_float(C); + float* 
bias = make_random_float(C); + + // move to GPU + floatX* d_residual; + floatX* d_normed; + floatX* d_inp1; + floatX* d_inp2; + floatX* d_mean; + floatX* d_rstd; + floatX* d_weight; + floatX* d_bias; + cudaCheck(cudaMalloc(&d_residual, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_normed, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_inp1, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_inp2, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_mean, B * T * sizeof(float))); + cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(float))); + cudaCheck(cudaMalloc(&d_weight, C * sizeof(float))); + cudaCheck(cudaMalloc(&d_bias, C * sizeof(float))); + cudaCheck(memcpy_convert(d_inp1, inp1, B * T * C)); + cudaCheck(memcpy_convert(d_inp2, inp2, B * T * C)); + cudaCheck(memcpy_convert(d_weight, weight, C)); + cudaCheck(memcpy_convert(d_bias, bias, C)); + + // first check the correctness of the kernel + residual_forward_cpu(residual, inp1, inp2, B * T * C); + layernorm_forward_cpu(normed, mean, rstd, residual, weight, bias, B, T, C); + + // time the kernel at different block sizes + int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + cudaCheck(cudaMemset(d_residual, 0, B * T * C * sizeof(floatX))); + fused_residual_forward(kernel_num, d_residual, d_normed, d_mean, d_rstd, d_inp1, d_inp2, d_weight, d_bias, + B*T, C, block_size); + float tol = std::is_same_v ? 1e-5 : 5e-2; + validate_result(d_residual, residual, "residual", B * T * C, tol); + validate_result(d_mean, mean, "mean", B * T, tol); + validate_result(d_rstd, rstd, "rstd", B * T, tol); + validate_result(d_normed, normed, "normed", B * T * C, tol); + } + + printf("All results match. Starting benchmarks.\n\n"); + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + + int repeat_times = 1000; + float elapsed_time = benchmark_kernel(repeat_times, fused_residual_forward, kernel_num, + d_residual, d_normed, d_mean, d_rstd, d_inp1, d_inp2, d_weight, d_bias, + B*T, C, block_size + ); + + // napkin math: estimate the memory bandwidth achieved + // for each (B,T,C) output element, we do 2 reads and 2 writes, plus 2 BT writes for mean/rstd + // and e.g. 
A100 40GB PCIe is advertised at 1,555GB/s + long memory_ops = B * T * (C * 4 + 2) * sizeof(floatX); + float memory_bandwidth = memory_ops / elapsed_time / 1e6; + float toks_per_msec = B * T / elapsed_time / 1e3; + + printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s | elements: %.2f ktok/ms\n", + block_size, elapsed_time, memory_bandwidth, toks_per_msec); + } + + // free memory + free(residual); + free(normed); + free(mean); + free(rstd); + free(weight); + free(bias); + free(inp1); + free(inp2); + cudaCheck(cudaFree(d_residual)); + cudaCheck(cudaFree(d_normed)); + cudaCheck(cudaFree(d_mean)); + cudaCheck(cudaFree(d_rstd)); + cudaCheck(cudaFree(d_weight)); + cudaCheck(cudaFree(d_bias)); + cudaCheck(cudaFree(d_inp1)); + cudaCheck(cudaFree(d_inp2)); + + return 0; +} diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index f611864f0..d0d38850d 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -182,14 +182,6 @@ __device__ float warpReduceMax(float val) { return val; } -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_down_sync(0xFFFFFFFF, val, offset); - } - return val; -} - __global__ void softmax_forward_kernel3(float* out, const float* inp, int N, int C) { // kernel must use block size of 32 extern __shared__ float shared[]; diff --git a/train_gpt2.cu b/train_gpt2.cu index 16ff756ce..fdc4968d2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -596,6 +596,87 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re } } +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! 
+ extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + + // inputs floatX, outputs FP32 (for current FP32-only activation path for this WIP) __global__ void permute_kernel(floatX* q, floatX* k, floatX* v, const floatX* inp, @@ -736,7 +817,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons } } -__global__ void residual_forward_kernel(floatX* out, floatX* inp1, floatX* inp2, int N) { +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2, int N) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; if (idx >= N) { return; } @@ -1184,7 +1265,7 @@ void encoder_backward(floatX* dwte, floatX* dwpe, } void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, - floatX* inp, floatX* weight, floatX* bias, + floatX* inp, const floatX* weight, const floatX* bias, int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 512; @@ -1321,7 +1402,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, cudaCheck(cudaGetLastError()); } -void residual_forward(floatX* out, floatX* inp1, floatX* inp2, int N) { +void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N) { NVTX_RANGE_FN(); const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * x128::size); @@ -1329,6 +1410,31 @@ void residual_forward(floatX* out, floatX* inp1, floatX* inp2, int N) { cudaCheck(cudaGetLastError()); } +void fused_residual_forward5(floatX* residual, floatX* normed, 
floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + const int block_size = 256; + int block_y = block_size / 32; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + residual_forward(residual, inp1, inp2, N*C); + layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C); + } + cudaCheck(cudaGetLastError()); +} + + void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; @@ -1855,17 +1961,17 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // forward pass ParameterTensors params = model->params; // for brevity ActivationTensors acts = model->acts; - floatX* residual; encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + // first layernorm isn't fused + layernorm_forward(acts.ln1, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C); + for (int l = 0; l < L; l++) { NvtxRange layer_range("Layer", l); - residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; + floatX* residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; // get the pointers of the weights for this layer - floatX* l_ln1w = params.ln1w + l * C; - floatX* l_ln1b = params.ln1b + l * C; floatX* l_qkvw = params.qkvw + l * 3*C * C; floatX* l_qkvb = params.qkvb + l * 3*C; floatX* l_attprojw = params.attprojw + l * C * C; @@ -1879,8 +1985,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // get the pointers of the activations for this layer floatX* l_ln1 = acts.ln1 + l * B * T * C; - floatX* l_ln1_mean = acts.ln1_mean + l * B * T; - floatX* l_ln1_rstd = acts.ln1_rstd + l * B * T; floatX* l_qkvr = acts.qkvr + l * B * T * 3*C; floatX* l_atty = acts.atty + l * B * T * C; floatX* l_attproj = acts.attproj + l * B * T * C; @@ -1894,8 +1998,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo floatX* l_residual3 = acts.residual3 + l * B * T * C; // now do the forward pass - layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); - #ifdef ENABLE_CUDNN float* l_att = (float*)acts.att + l * B * NH * T; // cuDNN needs a smaller FP32 tensor matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); @@ -1910,16 +2012,27 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo #endif matmul_forward_cublaslt(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); - residual_forward(l_residual2, residual, l_attproj, B*T*C); - layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); + fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, l_attproj, l_ln2w, l_ln2b, B*T, C); matmul_forward_cublaslt(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); gelu_forward(l_fch_gelu, l_fch, B*T*4*C); matmul_forward_cublaslt(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 
4*C, C); - residual_forward(l_residual3, l_residual2, l_fcproj, B*T*C); + + // OK, fusion across blocks. + if(l+1 != L) { + floatX* l_ln1 = acts.ln1 + (l + 1) * B * T * C; + floatX* l_ln1_mean = acts.ln1_mean + (l + 1) * B * T; + floatX* l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T; + const floatX* l_ln1w = params.ln1w + (l + 1) * C; + const floatX* l_ln1b = params.ln1b + (l + 1) * C; + fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, l_fcproj, l_ln1w, l_ln1b, + B * T, C); + } else { + fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, l_fcproj, + params.lnfw, params.lnfb, + B * T, C); + } } - residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 - layernorm_forward(acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); // also forward the cross-entropy loss function if we have the targets From 5920143d47ad4f7ecde47941e31ebff6bf0aea98 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Fri, 10 May 2024 00:02:37 +0200 Subject: [PATCH 049/172] remove warning noise --- dev/cuda/common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 5da54fe1d..0c2079821 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -71,7 +71,9 @@ int cuda_threads_per_SM = 0; // needed to calculate how many blocks to launch template struct alignas(16) Packed128 { - __device__ Packed128() = default; + // Note: = default implicitly generates a __device__ function, but explicitly + // adding __device__ causes a lot of warnings. + Packed128() = default; __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); From 75ec629f5215e6a91758dd0d0a14e2416150d1ce Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Fri, 10 May 2024 00:03:30 +0200 Subject: [PATCH 050/172] remove duplicate function --- dev/cuda/layernorm_backward.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index b3084e126..904a57e0c 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -113,13 +113,6 @@ void layernorm_backward_cpu(float* dinp, float* dweight, float* dbias, // GPU kernels // GPU helper functions for atomicAdd on smaller than 32-bit types -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} - #ifdef ENABLE_BF16 __device__ void atomicAddX(__nv_bfloat16* addr, __nv_bfloat16 val) { uintptr_t ptr_val = reinterpret_cast(addr); From 8ccf2f97f8ed2c05318de0d915132d9fe2eee060 Mon Sep 17 00:00:00 2001 From: Yijun Yu Date: Fri, 10 May 2024 06:49:24 +0800 Subject: [PATCH 051/172] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 29d53689e..f87161261 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project - Rust + - [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have same performance - 
[llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project - Zig From 5c90845f7a3e9e81d480edbc7db4fd58278d19fb Mon Sep 17 00:00:00 2001 From: lancer Date: Thu, 9 May 2024 22:04:02 -0700 Subject: [PATCH 052/172] update the -lcublas -lcublasLt flag in the comment --- dev/cuda/adamw.cu | 4 ++-- dev/cuda/attention_backward.cu | 2 +- dev/cuda/attention_forward.cu | 4 ++-- dev/cuda/crossentropy_forward.cu | 2 +- dev/cuda/crossentropy_softmax_backward.cu | 2 +- dev/cuda/encoder_backward.cu | 2 +- dev/cuda/encoder_forward.cu | 2 +- dev/cuda/fused_residual_forward.cu | 2 +- dev/cuda/gelu_backward.cu | 2 +- dev/cuda/gelu_forward.cu | 2 +- dev/cuda/layernorm_backward.cu | 2 +- dev/cuda/layernorm_forward.cu | 2 +- dev/cuda/matmul_backward.cu | 2 +- dev/cuda/matmul_backward_bias.cu | 2 +- dev/cuda/nccl_all_reduce.cu | 2 +- dev/cuda/residual_forward.cu | 2 +- dev/cuda/softmax_forward.cu | 2 +- dev/cuda/trimat_forward.cu | 2 +- 18 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dev/cuda/adamw.cu b/dev/cuda/adamw.cu index 23770b2c3..20a6560dd 100644 --- a/dev/cuda/adamw.cu +++ b/dev/cuda/adamw.cu @@ -6,8 +6,8 @@ References: * https://github.com/nvidia/apex/blob/master/csrc/multi_tensor_adam.cu Compile example: -nvcc adamw.cu -o adamw -nvcc -O3 --use_fast_math adamw.cu -o adamw +nvcc -lcublas -lcublasLt adamw.cu -o adamw +nvcc -O3 --use_fast_math -lcublas -lcublasLt adamw.cu -o adamw ./adamw diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu index 8e673d79f..c97dbeee8 100644 --- a/dev/cuda/attention_backward.cu +++ b/dev/cuda/attention_backward.cu @@ -2,7 +2,7 @@ Kernels for attention backward pass. Compile example: -nvcc -O3 --use_fast_math attention_backward.cu -o attention_backward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_backward.cu -o attention_backward version 1 is a naive first version OMP_NUM_THREADS=32 ./attention_backward 1 diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a7b6fff34..b632b4a66 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -6,10 +6,10 @@ If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels See the README for cuDNN install instructions Compile example with cuDNN: -nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward +nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math --lcublas -lcublasLt -lcudnn attention_forward.cu -o attention_forward Compile example without cuDNN: -nvcc -O3 --use_fast_math -lcublas attention_forward.cu -o attention_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_forward.cu -o attention_forward version 1 is naive port from CPU code to kernel, parallelize over batch, time, heads only ./attention_forward 1 diff --git a/dev/cuda/crossentropy_forward.cu b/dev/cuda/crossentropy_forward.cu index 2385a6c4f..ca312ba36 100644 --- a/dev/cuda/crossentropy_forward.cu +++ b/dev/cuda/crossentropy_forward.cu @@ -2,7 +2,7 @@ Kernels for crossentropy forward pass. 
Compile example: -nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward version 1 is a straight-forward port from CPU code to kernel, parallel over B,T ./crossentropy_forward 1 diff --git a/dev/cuda/crossentropy_softmax_backward.cu b/dev/cuda/crossentropy_softmax_backward.cu index 164bceddf..27521bf60 100644 --- a/dev/cuda/crossentropy_softmax_backward.cu +++ b/dev/cuda/crossentropy_softmax_backward.cu @@ -2,7 +2,7 @@ Kernels for crossentropy forward pass. Compile example: -nvcc -O3 --use_fast_math crossentropy_softmax_backward.cu -o crossentropy_softmax_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward version 1 is a straight-forward port from CPU code to kernel, parallel over B,T ./crossentropy_softmax_backward 1 diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu index 8c96eaf46..53221878e 100644 --- a/dev/cuda/encoder_backward.cu +++ b/dev/cuda/encoder_backward.cu @@ -2,7 +2,7 @@ Kernels for the positional encoder forward pass in GPT-2. Compile example: -nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_backward.cu -o encoder_backward version 1 is naive port from CPU code to kernel parallelizes over B,T,C, uses atomics to add to dwte, dwpe diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu index e901fd654..39d5f0fa3 100644 --- a/dev/cuda/encoder_forward.cu +++ b/dev/cuda/encoder_forward.cu @@ -2,7 +2,7 @@ Kernels for the positional encoder forward pass in GPT-2. Compile example: -nvcc -O3 --use_fast_math encoder_forward.cu -o encoder_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_forward.cu -o encoder_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./encoder_forward 1 diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu index f228503af..b98a67c4b 100644 --- a/dev/cuda/fused_residual_forward.cu +++ b/dev/cuda/fused_residual_forward.cu @@ -2,7 +2,7 @@ Kernels for residual forward pass fused with layernorm Compile example: -nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt fused_residual_forward.cu -o fused_residual_forward version 1 is naive port from CPU code to kernel ./fused_residual_forward 1 diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu index bbd81c4bc..3d12dd864 100644 --- a/dev/cuda/gelu_backward.cu +++ b/dev/cuda/gelu_backward.cu @@ -2,7 +2,7 @@ Kernels for gelu backward pass. Compile example: -nvcc -O3 --use_fast_math gelu_backward.cu -o gelu_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_backward.cu -o gelu_backward If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file: diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu index e07ad663a..01abfe2b5 100644 --- a/dev/cuda/gelu_forward.cu +++ b/dev/cuda/gelu_forward.cu @@ -2,7 +2,7 @@ Kernels for gelu forward pass. 
Compile example: -nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file: diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 904a57e0c..575e0a962 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -2,7 +2,7 @@ Kernels for layernorm backward pass. Compile example: -nvcc -O3 --use_fast_math layernorm_backward.cu -o layernorm_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_backward.cu -o layernorm_backward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./layernorm_backward 1 diff --git a/dev/cuda/layernorm_forward.cu b/dev/cuda/layernorm_forward.cu index 5cefd408e..3e948289a 100644 --- a/dev/cuda/layernorm_forward.cu +++ b/dev/cuda/layernorm_forward.cu @@ -2,7 +2,7 @@ Kernels for layernorm forward pass. Compile example: -nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./layernorm_forward 1 diff --git a/dev/cuda/matmul_backward.cu b/dev/cuda/matmul_backward.cu index 9d3763930..dece1f6dc 100644 --- a/dev/cuda/matmul_backward.cu +++ b/dev/cuda/matmul_backward.cu @@ -2,7 +2,7 @@ Kernels for matmul backward pass. Compile example: -nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward OMP_NUM_THREADS=32 ./matmul_backward 1 */ diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 7aef54547..65b331699 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -2,7 +2,7 @@ Kernels for matmul backward pass bias only. Compile example: -nvcc -O3 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias +nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias ./matmul_backward_bias 1 ./matmul_backward_bias 2 diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu index 3bc9564f1..260ba02ba 100644 --- a/dev/cuda/nccl_all_reduce.cu +++ b/dev/cuda/nccl_all_reduce.cu @@ -5,7 +5,7 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc. Then aggregates the values in the resulting vectors. Compile example: -nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ nccl_all_reduce.cu -o nccl_all_reduce +nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce Run on 2 local GPUs (set -np to a different value to change GPU count): mpirun -np 2 ./nccl_all_reduce diff --git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu index f07871a29..fd7d1fb8e 100644 --- a/dev/cuda/residual_forward.cu +++ b/dev/cuda/residual_forward.cu @@ -2,7 +2,7 @@ Kernels for residual forward pass. 
Compile example: -nvcc -O3 --use_fast_math residual_forward.cu -o residual_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward version 1 is naive port from CPU code to kernel ./residual_forward 1 diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index d0d38850d..279549b28 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -2,7 +2,7 @@ Kernels for softmax forward pass. Compile example: -nvcc -O3 --use_fast_math softmax_forward.cu -o softmax_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt softmax_forward.cu -o softmax_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./softmax_forward 1 diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu index 133ced16f..1c093e2a1 100644 --- a/dev/cuda/trimat_forward.cu +++ b/dev/cuda/trimat_forward.cu @@ -3,7 +3,7 @@ Triangular matrix multiplication as in autoregressive attention. A short story. by @ngc92 Compile: -nvcc -O3 --use_fast_math trimat_forward.cu -o trimat_forward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt trimat_forward.cu -o trimat_forward -lcublas Run: From 6da5e63e2c25ba6c53d8e16438fcf595f81ac44d Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Fri, 10 May 2024 10:57:39 +0300 Subject: [PATCH 053/172] Fix detection of cudnn-frontend in '.' on Linux Plus minor legibility fixes in the cudnn-frontend explanation. --- Makefile | 10 ++++------ README.md | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index eabb5646d..46abdc9a5 100644 --- a/Makefile +++ b/Makefile @@ -19,8 +19,7 @@ NVCC_INCLUDES = NVCC_LDLIBS = NCLL_INCUDES = NVCC_CUDNN = -# overridable flag for multi-GPU training. by default we won't build with cudnn -# because it bloats up the compile time from a few seconds to ~minute +# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute USE_CUDNN ?= 0 # Function to check if a file exists in the PATH @@ -86,16 +85,15 @@ else endif # Check and include cudnn if available -# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line +# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include # Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now in either . or ($HOME) directory - # this can be overridden by setting CUDNN_FRONTEND_PATH on the command line ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include - else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"),) + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else diff --git a/README.md b/README.md index f87161261..b4298acff 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ sudo apt-get update sudo apt-get -y install libcudnn9-dev-cuda-12 ``` -On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. 
the Makefile looks for `~/cudnn-frontend/include`). +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line. **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: From 3dbb0bb89cd8115c25bd4f423bed390ea011fd8b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 11 May 2024 17:19:41 +0000 Subject: [PATCH 054/172] bump the threshold for qkvw because flashattention expands the error here a tiny bit --- test_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 7613b6ba3..654e35db1 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { // In that case it's ok to extend the tolerance by a bit, after a manual review. allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 8e-1f); allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 1e-2f); - allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.1e-1); // hmm a bit high + allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.4e-1); // hmm a bit high allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 4e-2f); allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 3e-2f); allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", 3e-2f); From b88f683569423adae51d878f78bfe71df973c6c1 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 11 May 2024 17:49:22 -0700 Subject: [PATCH 055/172] Add Windows to CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change notes: • Add Windows build/test to CI matrix build • Add Cuda Windows build • Replace v3 checkout actions to v4 per GitHub recommendations --- .github/workflows/ci.yml | 100 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52715eb9c..bb19f2ba5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,15 +12,16 @@ jobs: build-and-test-cpu: strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -37,18 +38,105 @@ jobs: - name: Train model run: python train_gpt2.py --device=cpu + - name: Download Win32 Make.exe + if: matrix.os == 'windows-latest' + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + if: matrix.os == 'windows-latest' + run: | + unzip make-bin-win64.zip + - name: Compile training and testing program + if: matrix.os != 'windows-latest' run: make test_gpt2 train_gpt2 + - name: Compile training and testing program for Windows + if: matrix.os == 
'windows-latest' + shell: cmd + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 + - name: Execute testing program (With OpenMP) + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' + shell: cmd + run: | + copy test_gpt2 test_gpt2.exe + test_gpt2.exe + - name: Compile training and testing program without OpenMP + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) + if: matrix.os != 'windows-latest' run: ./test_gpt2 + build-cuda-windows: + runs-on: windows-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download Win32 Make.exe + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + run: | + unzip make-bin-win64.zip + + - name: Install Cuda Toolkit 12.4 on Windows + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" 
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + + # Default installation path for CUDA Toolkit is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 + - name: Add Path + run: | + echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Build Cuda targets + shell: cmd + working-directory: ${{ github.workspace }} + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu + build-cuda-fp32: runs-on: ubuntu-latest container: @@ -56,7 +144,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build FP32 checkpoint run: make train_gpt2fp32cu test_gpt2fp32cu @@ -71,7 +159,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -83,7 +171,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -95,7 +183,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP and OpenMPI run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev From e64df911910bfd1f235d8f0e0e97478c67d16e53 Mon Sep 17 00:00:00 2001 From: ntr Date: Sun, 12 May 2024 16:02:36 +0200 Subject: [PATCH 056/172] Add Llm.cs by nietras to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b4298acff..41cef25af 100644 --- a/README.md +++ b/README.md @@ -344,6 +344,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project + - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. 
Clone and run ✅ - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) From ec92368f8b34dda6e5821556db0581dde9e0a75f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 12 May 2024 19:24:43 +0300 Subject: [PATCH 057/172] added current backward bias kernel to dev cuda --- dev/cuda/matmul_backward_bias.cu | 95 ++++++++++++++++++++++++++++++++ train_gpt2.cu | 4 +- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 65b331699..233b7a197 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -22,6 +22,8 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include #include + +//#define ENABLE_BF16 #include "common.h" // ---------------------------------------------------------------------------- @@ -45,6 +47,8 @@ void matmul_backward_bias_cpu(float* dinp, float* dweight, float* dbias, // ---------------------------------------------------------------------------- // GPU kernels +float* dbias_buffer; + __global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, int B, int T, int OC) { extern __shared__ float shared[]; int o = blockIdx.x; // range [0, OC) @@ -180,6 +184,66 @@ __global__ void matmul_backward_bias_kernel5(float* dbias, const float* dout, in } +__global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { + // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater +} + +__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC, const int block_size) { + // note: this kernel reads in floatX, but it writes to float! + // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs + // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX + // (this also results in higher accuracy than doing accumulation directly in floatX) + + // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
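+    // (note: in this standalone benchmark file, the block/grid setup those comments refer to
+    //  lives in the matmul_backward_bias7() launcher further down in this same file)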
+ const int block_size_x = 32; + const int block_size_y = block_size / block_size_x; // 16 + const int OC_per_warp = block_size_x * x128::size; // 256 at BF16 + + int local_oc = threadIdx.x * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + float accumulators[x128::size]; + extern __shared__ float shared[]; + + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + int thread_id = threadIdx.y * block_size_x + threadIdx.x; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; + } + __syncthreads(); + if(global_oc < OC) { + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, + // so we accumulate in a conflict-free order, then reorder to match the global memory order + for (int k = 0; k < x128::size; k++) { + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + } + } + if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data + __syncthreads(); + // read the accumulated values in the conflict-free order + int i = threadIdx.x + (threadIdx.y * block_size_x); + float tmp = shared[i]; + __syncthreads(); + // write them back to shared memory in the global memory order + // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) + shared[local_oc + threadIdx.y] = tmp; + __syncthreads(); + // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) + if (i + blockIdx.x*OC_per_warp < OC) { + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + } +} + + // ---------------------------------------------------------------------------- // kernel launcher @@ -224,6 +288,33 @@ void matmul_backward_bias5(float* dinp, float* dweight, float* dbias, matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } +void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, + float* dout, float* inp, float* weight, float* ones, + int B, int T, int C, int OC, int block_size) { + if(block_size < 128) { + block_size = 128; + } + // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) + // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end + // blockDim.x is 32 --> single warp being responsible for those 256 OCs + // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs + // gridDim.x is OC / 256 --> each block processes 256 OCs + // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + const int warp_size = 32; + const int OC_per_warp = warp_size * x128::size; // 256 at BF16 + const int block_size_x = 32; + const int block_size_y = block_size / block_size_x; // 16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! 
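+    // e.g. (illustrative numbers) with block_size=512 and OC=768 at BF16, grid_size_x = 3; on a GPU with
+    // 108 SMs and 2048 threads per SM this gives grid_size_y = 221184 / (512*3) = 144 blocks striding over B*T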
+ + assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops + + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC, block_size); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); +} + void matmul_backward_bias(int kernel_num, float* dinp, float* dweight, float* dbias, float* dout, float* inp, float* weight, float* ones, @@ -244,6 +335,9 @@ void matmul_backward_bias(int kernel_num, case 5: matmul_backward_bias5(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); break; + case 7: + matmul_backward_bias7(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -276,6 +370,7 @@ int main(int argc, char **argv) { float* d_dout; cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(float))); cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(float))); + cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); cudaCheck(cudaMemcpy(d_dbias, dbias, OC * sizeof(float), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_dout, dout, B * T * OC * sizeof(float), cudaMemcpyHostToDevice)); diff --git a/train_gpt2.cu b/train_gpt2.cu index 3285aa1d5..1c6cdb136 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -869,7 +869,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX - // (this also results in higher accuracy than doing doing accumulation directly in floatX) + // (this also results in higher accuracy than doing accumulation directly in floatX) // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
const int block_size = 512; @@ -897,7 +897,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i accumulators[k] += (float)packed_dout[k]; } } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, // so we accumulate in a conflict-free order, then reorder to match the global memory order for (int k = 0; k < x128::size; k++) { atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); From 2287da01207d8d4d92eaeb12cee0f2093dd46a6a Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 12 May 2024 19:42:32 +0300 Subject: [PATCH 058/172] enable bf16 --- dev/cuda/matmul_backward_bias.cu | 89 ++++++++++++++------------------ 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 233b7a197..024eca4d6 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -23,7 +23,7 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include -//#define ENABLE_BF16 +#define ENABLE_BF16 #include "common.h" // ---------------------------------------------------------------------------- @@ -49,16 +49,16 @@ void matmul_backward_bias_cpu(float* dinp, float* dweight, float* dbias, float* dbias_buffer; -__global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel1(floatX* dbias, const floatX* dout, int B, int T, int OC) { extern __shared__ float shared[]; int o = blockIdx.x; // range [0, OC) int tid = threadIdx.x; // range [0, block_size) int block_size = blockDim.x; - const float* x = dout + o; + const floatX* x = dout + o; // thread coarsening float sum = 0.0; for (int i = tid; i < B * T; i += block_size) { - sum += x[i * OC]; + sum += (float)x[i * OC]; } shared[tid] = sum; __syncthreads(); @@ -71,12 +71,12 @@ __global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, in } // write the final result (at thread 0) to global memory if (tid == 0) { - dbias[o] += shared[0]; + dbias[o] = (float)dbias[o] + shared[0]; } } // cooperative groups solution, one warp per output channel -__global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel2(floatX* dbias, const floatX* dout, int B, int T, int OC) { // dout is (B, T, OC), dbias is (OC) // e.g. 
if block_size = 128, then we have 4 warps per block, each in charge of one output channel namespace cg = cooperative_groups; @@ -89,7 +89,7 @@ __global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, in // first, thread coarsening to sum reduce the problem size from B*T to 32 float sum = 0.0f; for(int i = warp.thread_rank(); i < BT; i += warp.size()) { - sum += dout[i * OC + idx]; + sum += (float)dout[i * OC + idx]; } // now do a warp-level reduce to get the sum across the 32 threads in this warp sum = cg::reduce(warp, sum, cg::plus{}); @@ -99,7 +99,7 @@ __global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, in } } -__global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel3(floatX* dbias, const floatX* dout, int B, int T, int OC) { // dout is (B, T, OC), dbias is (OC) // in this version of the kernel the entire block of block_size is dedicated to one output channel namespace cg = cooperative_groups; @@ -114,7 +114,7 @@ __global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, in // round 1: thread coarsening to reduce the problem size from B*T to 32 float thread_sum = 0.0f; for(int i = threadIdx.x; i < BT; i += blockDim.x) { - thread_sum += dout[i * OC + idx]; + thread_sum += (float)dout[i * OC + idx]; } // now do a warp-level reduce to get the sum across the 32 threads in each warp float warp_sum = cg::reduce(warp, thread_sum, cg::plus{}); @@ -136,7 +136,7 @@ __global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, in // the idea is to employ one block to reduce along several columns, // where each block has a width of 32 columns to ensure coalesced access. // at the end we accumulate the reductions performed by the warps in each block via shared memory -__global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, int B, int T, int OC) { // this kernel is launched with 1D grid_dim of OC/32 // for example let's say block_size is 128 extern __shared__ float smem[]; // of size block_size (128) @@ -147,7 +147,7 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in // pointer to the start of the column for one lane of threads // so e.g. 
4 threads (of the same lane_id) will reduce this one column - const float* dout_col = dout + tl + lane_id; + const floatX* dout_col = dout + tl + lane_id; // column reductions by looping through the rows // each of the 4 threads offsets by its warp_id and then skips by vstep @@ -156,7 +156,7 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in // leading to a coalesced memory access pattern float dout_sum = 0.0f; for (int row = warp_id; row < B * T; row += vstep) { - dout_sum += dout_col[row * OC]; + dout_sum += (float)dout_col[row * OC]; } smem[lane_id + warp_id * warpSize] = dout_sum; __syncthreads(); @@ -171,13 +171,13 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in } } -__global__ void matmul_backward_bias_kernel5(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, int B, int T, int OC) { int oc = blockIdx.x * blockDim.x + threadIdx.x; if(oc >= OC) return; float sum = 0.0; // grid-wide loop for maximum parallelism for (int i = blockIdx.y; i < B * T; i += gridDim.y) { - sum += dout[i * OC + oc]; + sum += (float)dout[i * OC + oc]; } // and atomically add everything together. atomics within one block are conflict-free! atomicAdd(dbias + oc, sum); @@ -248,8 +248,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // kernel launcher // version1: simple cuBLAS calls -void matmul_backward_bias1(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias1(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { dim3 block_dim(block_size); dim3 grid_dim(OC); @@ -257,42 +256,37 @@ void matmul_backward_bias1(float* dinp, float* dweight, float* dbias, matmul_backward_bias_kernel1<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias2(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias2(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { // block_size 512 seems best const int grid_size = ceil_div(OC * 32, block_size); matmul_backward_bias_kernel2<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias3(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias3(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { // block_size 256 seems best matmul_backward_bias_kernel3<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias4(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias4(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { assert(OC % 32 == 0); // OC must be divisible by 32 for this kernel const int grid_size = OC / 32; matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias5(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias5(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void 
matmul_backward_bias7(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { - if(block_size < 128) { - block_size = 128; + if(block_size < 256) { + block_size = 256; } // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end @@ -315,28 +309,26 @@ void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } -void matmul_backward_bias(int kernel_num, - float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { switch (kernel_num) { case 1: - matmul_backward_bias1(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias1(dbias, dout, B, T, C, OC, block_size); break; case 2: - matmul_backward_bias2(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias2(dbias, dout, B, T, C, OC, block_size); break; case 3: - matmul_backward_bias3(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias3(dbias, dout, B, T, C, OC, block_size); break; case 4: - matmul_backward_bias4(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); break; case 5: - matmul_backward_bias5(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); break; case 7: - matmul_backward_bias7(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); break; default: printf("Invalid kernel number\n"); @@ -366,13 +358,13 @@ int main(int argc, char **argv) { float* dout = make_random_float(B * T * OC); // move to GPU - float* d_dbias; - float* d_dout; - cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(float))); - cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(float))); + floatX* d_dbias; + floatX* d_dout; + cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(floatX))); cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); - cudaCheck(cudaMemcpy(d_dbias, dbias, OC * sizeof(float), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_dout, dout, B * T * OC * sizeof(float), cudaMemcpyHostToDevice)); + cudaCheck(memcpy_convert(d_dbias, dbias, OC)); + cudaCheck(memcpy_convert(d_dout, dout, B * T * OC)); // ncu debugging / profiling, do a single call // int block_size_debug; @@ -391,23 +383,22 @@ int main(int argc, char **argv) { for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; // memset the bias to zero - cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(float))); + cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(floatX))); // calculate the GPU version - matmul_backward_bias(kernel_num, NULL, NULL, d_dbias, d_dout, NULL, NULL, NULL, B, T, C, OC, block_size); + matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, C, OC, block_size); // compare printf("Checking correctness...\n"); - validate_result(d_dbias, dbias, "dbias", OC, 5e-3f); + float tol = std::is_same_v ? 
5e-3f : 1.0f; + validate_result(d_dbias, dbias, "dbias", OC, tol); printf("All results match for block_size=%d.\n\n", block_size); } // now benchmark the kernel for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; - float *d_dinp, *d_dweight, *d_inp, *d_weight, *d_ones; int repeat_times = 2000; float elapsed_time = benchmark_kernel(repeat_times, matmul_backward_bias, kernel_num, - d_dinp, d_dweight, d_dbias, d_dout, d_inp, d_weight, d_ones, - B, T, C, OC, block_size); + d_dbias, d_dout, B, T, C, OC, block_size); printf("block_size %d time %.4f ms\n", block_size, elapsed_time); } From b3a5d1da15e63c955af840cc395930ccccfe3ee4 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 05:55:46 +0000 Subject: [PATCH 059/172] shard master_weights --- train_gpt2.cu | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index bd11734d9..ad4f9519c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2372,26 +2372,23 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { - cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + cudaCheck(cudaMalloc((void**)&model->master_weights, multi_gpu_config->shard_num_parameters * sizeof(float))); + copy_and_cast_kernel<<shard_num_parameters, 512), 512, 0, main_stream>>>( + model->master_weights, (floatX*)model->params_memory, multi_gpu_config->shard_num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for master copy of params\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); } } floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; - float* master_weights = NULL; - if (model->use_master_weights == 1) { - master_weights = model->master_weights + multi_gpu_config->shard_offset; - } int block_size = 512; int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, + adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); @@ -2403,20 +2400,11 @@ void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) if (multi_gpu_config->num_processes == 1) return; if (multi_gpu_config->zero_stage == 1) { - // gather all parameter updates from each process - if (model->use_master_weights == 1) { - ncclCheck(ncclAllGather(model->master_weights + multi_gpu_config->shard_offset, model->master_weights, - multi_gpu_config->shard_num_parameters, ncclFloat, - 
multi_gpu_config->nccl_comm, 0)); - // Copy and cast master weights to params - copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); - } - else { - ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, - multi_gpu_config->shard_num_parameters, ncclFloatX, - multi_gpu_config->nccl_comm, 0)); - } - } + // gather updated shards of model->params_memory from each process + ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, + multi_gpu_config->shard_num_parameters, ncclFloatX, + multi_gpu_config->nccl_comm, 0)); + } cudaCheck(cudaGetLastError()); #endif } From 086ce2ff4de9752b34b3f098c648230f04684e1e Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 06:48:22 +0000 Subject: [PATCH 060/172] Remove unsused template specializations and refactor --- train_gpt2.cu | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index ad4f9519c..8a743ad80 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1282,16 +1282,6 @@ __device__ float cast_value(half val) { return __half2float(val); } -template<> -__device__ half cast_value(float val) { - return __float2half(val); -} - -template<> -__device__ __nv_bfloat16 cast_value<__nv_bfloat16, float>(float val) { - return __float2bfloat16(val); -} - template<> __device__ float cast_value(__nv_bfloat16 val) { return __bfloat162float(val); @@ -2302,7 +2292,6 @@ void gpt2_backward(GPT2 *model) { // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return value; // MPI doesn't support all reduce with mean, so we sum up, then divide. float result; mpiCheck(MPI_Allreduce(&value, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD)); @@ -2315,11 +2304,11 @@ float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_conf // Averages out the loss and gradients across all GPUs. No-op when multi-GPU is disabled. // todo - this version only works if all the parameters are the same size (floatX) void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { +#ifdef MULTI_GPU NVTX_RANGE_FN(); + if (multi_gpu_config->num_processes == 1) return; // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); -#ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return; // Average all gradients. 
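    // (the call below is an in-place all-reduce: send and receive buffers are both grads_memory,
    //  so after it completes every rank holds the same reduced gradients)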
ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, From f613ce895b30dc0b2bd1f7e81410c6a2dcdce74d Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 09:13:02 +0000 Subject: [PATCH 061/172] Fix copy and cast params to master weights --- train_gpt2.cu | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 8a743ad80..830e644e6 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2353,32 +2353,32 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); + size_t num_parameters = multi_gpu_config->shard_num_parameters; + floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; + if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, multi_gpu_config->shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, multi_gpu_config->shard_num_parameters* sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { - cudaCheck(cudaMalloc((void**)&model->master_weights, multi_gpu_config->shard_num_parameters * sizeof(float))); - copy_and_cast_kernel<<shard_num_parameters, 512), 512, 0, main_stream>>>( - model->master_weights, (floatX*)model->params_memory, multi_gpu_config->shard_num_parameters); + cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); + copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } } - floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; - floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; - int block_size = 512; - int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); + int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, 
grads_memory, - model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, + model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); } From c0329ebdba7c33b3703d359dc4f214ae986479c1 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:15:48 +0300 Subject: [PATCH 062/172] new kernel version with fewer atomics --- dev/cuda/matmul_backward_bias.cu | 82 ++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 024eca4d6..9bd4b0428 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -243,6 +243,73 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i } } +// We want to decrease the amount of channels handled by each block, so that we need fewer across-block reductions. +// We do this by realizing the following: For scalar memory access, we need to read one element per thread in a warp +// to read an entire cacheline, but for vectorized memory access, with 128 bit of data per thread, we only need eight +// threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. +// => blockDim.x == 4, blockDim.y == 32/4 = 8 +// +__global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, int B, int T, int OC) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); + + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 256 at BF16 + + int local_oc = warp_c * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + + if(global_oc < OC) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } + __syncthreads(); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + atomicAdd(dbias + global_oc + k, a); + } + } +} // ---------------------------------------------------------------------------- // kernel launcher @@ -309,6 +376,18 @@ void matmul_backward_bias7(floatX* dbias, floatX* dout, cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } +void matmul_backward_bias8(floatX* dbias, floatX* dout, + int B, int T, int C, int OC, int block_size) { + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int 
OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); +} + void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { switch (kernel_num) { @@ -330,6 +409,9 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, case 7: matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); break; + case 8: + matmul_backward_bias8(dbias, dout, B, T, C, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From 081d224b21c5991d548b5e5acf2fcfd96901036f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:39:32 +0300 Subject: [PATCH 063/172] automatically switch to buffer-less version if that can fill up the GPU --- dev/cuda/matmul_backward_bias.cu | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 9bd4b0428..3d5484df2 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -249,7 +249,8 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. // => blockDim.x == 4, blockDim.y == 32/4 = 8 // -__global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, int B, int T, int OC) { +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -306,7 +307,11 @@ __global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, i } if(warp_d == 0 && global_oc < OC) { // coalesced, but not cacheline-sized - atomicAdd(dbias + global_oc + k, a); + if constexpr (std::is_same_v) { + dbias[global_oc + k] = a; + } else { + atomicAdd(dbias + global_oc + k, a); + } } } } @@ -383,9 +388,15 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. 
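+    // As a rough illustration (exact numbers depend on the GPU, so treat this as an example rather than a
+    // guarantee): grid_size_y is cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x), clamped to
+    // at least 1, so it collapses to 1 exactly when the ceil_div(OC, OC_per_warp) blocks of block_size threads
+    // already saturate the GPU on their own, e.g. for a very wide output such as a vocabulary projection.
+    // In that case every output channel is reduced by a single block, so no cross-block reduction is needed
+    // and the kernel can write its result straight into dbias instead of going through the fp32 buffer.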
+ if(grid_size_y == 1) { + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + } else { + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + } } void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, @@ -457,7 +468,7 @@ int main(int argc, char **argv) { // matmul_backward_bias(kernel_num, NULL, NULL, d_dbias, d_dout, NULL, NULL, NULL, B, T, C, OC, block_size_debug); // exit(EXIT_SUCCESS); - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; // calculate the CPU reference matmul_backward_bias_cpu(NULL, NULL, dbias, dout, NULL, NULL, B, T, C, OC); From aa41b3262480f2288e80d7f3f55efb03cdef2701 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:48:25 +0300 Subject: [PATCH 064/172] update main file --- train_gpt2.cu | 130 +++++++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1c6cdb136..5a08cd249 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -865,57 +865,70 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC) { - // note: this kernel reads in floatX, but it writes to float! - // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs - // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX - // (this also results in higher accuracy than doing accumulation directly in floatX) +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); - // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
- const int block_size = 512; - const int block_size_x = 32; - const int block_size_y = block_size / block_size_x; // 16 - const int OC_per_warp = block_size_x * x128::size; // 256 at BF16 + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 256 at BF16 - int local_oc = threadIdx.x * x128::size; + int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; - float accumulators[x128::size]; - __shared__ float shared[OC_per_warp]; + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; for (int k = 0; k < x128::size; k++) { accumulators[k] = 0.0f; } - int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { - shared[idx] = 0.0f; - } - __syncthreads(); + if(global_oc < OC) { - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } - } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, - // so we accumulate in a conflict-free order, then reorder to match the global memory order - for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); - } - } - if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data - __syncthreads(); - // read the accumulated values in the conflict-free order - int i = threadIdx.x + (threadIdx.y * block_size_x); - float tmp = shared[i]; - __syncthreads(); - // write them back to shared memory in the global memory order - // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) - shared[local_oc + threadIdx.y] = tmp; + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } __syncthreads(); - // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) - if (i + blockIdx.x*OC_per_warp < OC) { - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + if constexpr (std::is_same_v) { + dbias[global_oc + k] = a; + } else { + atomicAdd(dbias + global_oc + k, a); + } + } } } @@ -1462,28 +1475,25 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // backward to bias, if given, does a += if (dbias != NULL) { - // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) 
- // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end - // blockDim.x is 32 --> single warp being responsible for those 256 OCs - // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs - // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! - const int warp_size = 32; - const int block_size = 512; - const int OC_per_warp = warp_size * x128::size; // 256 at BF16 - const int block_size_x = 32; - const int block_size_y = block_size / block_size_x; // 16 - const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 - const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount - / (block_size * grid_size_x)); // full GPU! - - assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops - - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // Each warp is responsible for 8 * "x128::size" = 64 OCs at BF16 (OC must be a multiple of 64!) + // Block size is 1024 | 768 threads (32|24 warps) and we reduce those values into 1 at the end + + const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; + + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! + + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. + if(grid_size_y == 1) { + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + } else { + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + } } // backward to input, uses = in the backward pass (set the gradient) From 49ee3c830773903215279bdd4840d80b8df1dd1d Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 18:27:56 +0300 Subject: [PATCH 065/172] fix non-atomic version: * accumulate instead of assign * need dedicated argument to correctly handle the floatX == float case --- dev/cuda/matmul_backward_bias.cu | 14 ++++++++------ train_gpt2.cu | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 3d5484df2..155c20098 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -22,6 +22,7 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include #include +#include #define ENABLE_BF16 #include "common.h" @@ -249,8 +250,9 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. 
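// (spelling out the arithmetic: x128 carries 128 bits = 16 bytes per thread, so a 128-byte cacheline is
//  covered by 128/16 = 8 consecutive threads, and the remaining 32/8 = 4 threads of the warp can handle
//  4 different bt positions at once)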
// => blockDim.x == 4, blockDim.y == 32/4 = 8 // -template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -307,8 +309,8 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout } if(warp_d == 0 && global_oc < OC) { // coalesced, but not cacheline-sized - if constexpr (std::is_same_v) { - dbias[global_oc + k] = a; + if constexpr (!Atomic) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { atomicAdd(dbias + global_oc + k, a); } @@ -391,10 +393,10 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } diff --git a/train_gpt2.cu b/train_gpt2.cu index 5a08cd249..5a553eda9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -865,8 +865,13 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { +// templated because if we have enough channels, we can write directly to the bf16 dbias buffer, and otherwise +// we need to write to a fp32 temp buffer. The `Atomic` argument indicates whether we add atomically. We cannot +// (easily) use a regular runtime `if(blockDim.y == 1)` runtime condition, because that doesn't compile for older +// GPUs. +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -921,10 +926,12 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout v += __shfl_down_sync(0xffffffff, v, 2, 4); a += v; } + + // coalesced, but not cacheline-sized writes if(warp_d == 0 && global_oc < OC) { - // coalesced, but not cacheline-sized - if constexpr (std::is_same_v) { - dbias[global_oc + k] = a; + // if we have only one block per result, no need for atomics + if constexpr (!Atomic) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { atomicAdd(dbias + global_oc + k, a); } @@ -1488,10 +1495,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. 
if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } From 65727d5a4da65eebe9b5dbee01201f9eac492b93 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 19:20:03 +0300 Subject: [PATCH 066/172] fix CI compile by disabling kernel 5 --- dev/cuda/matmul_backward_bias.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 155c20098..66a59801f 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -172,6 +172,7 @@ __global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, } } +#ifndef ENABLE_BF16 __global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, int B, int T, int OC) { int oc = blockIdx.x * blockDim.x + threadIdx.x; if(oc >= OC) return; @@ -183,6 +184,7 @@ __global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, // and atomically add everything together. atomics within one block are conflict-free! atomicAdd(dbias + oc, sum); } +#endif __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { @@ -350,12 +352,14 @@ void matmul_backward_bias4(floatX* dbias, floatX* dout, matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); } +#ifndef ENABLE_BF16 void matmul_backward_bias5(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } +#endif void matmul_backward_bias7(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { @@ -417,7 +421,12 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); break; case 5: +#ifndef ENABLE_BF16 matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); +#else + fprintf(stderr, "Kernel 5 is only supported for fp32"); + exit(1); +#endif break; case 7: matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); From c66e48c06c1a64b5de55fd37bd67531f8fcbcc85 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 20:58:07 +0300 Subject: [PATCH 067/172] fixup comment --- dev/cuda/matmul_backward_bias.cu | 4 ++-- train_gpt2.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 66a59801f..0bf5e44dd 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -256,7 +256,7 @@ template __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -264,7 +264,7 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout int warp_c = (int)threadIdx.y; int block_d = (int)threadIdx.z; - const int OC_per_warp = bdy * x128::size; // 256 at BF16 + const int OC_per_warp = bdy * x128::size; // 
64 at BF16 int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; diff --git a/train_gpt2.cu b/train_gpt2.cu index 5a553eda9..dd98ea000 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -873,7 +873,7 @@ template __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -881,7 +881,7 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout int warp_c = (int)threadIdx.y; int block_d = (int)threadIdx.z; - const int OC_per_warp = bdy * x128::size; // 256 at BF16 + const int OC_per_warp = bdy * x128::size; // 64 at BF16 int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; From dd8c9f5ec9bf268cd0ed562a4f07214d0bfa1199 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 20:43:51 +0300 Subject: [PATCH 068/172] fix layernorm backward: accumulate weight gradient --- dev/cuda/layernorm_backward.cu | 10 ++++++---- train_gpt2.cu | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 575e0a962..e22155247 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -842,11 +842,13 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); - x128 dbias128; - x128 dweight128; + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; - dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*warpSize]; + float s_dw = scratch_dweight[shared_index + x*warpSize]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } store128(dbias + global_index, dbias128); store128(dweight + global_index, dweight128); diff --git a/train_gpt2.cu b/train_gpt2.cu index 3285aa1d5..469a0d6ac 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1029,11 +1029,13 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); - x128 dbias128; - x128 dweight128; + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; - dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*warpSize]; + float s_dw = scratch_dweight[shared_index + x*warpSize]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } store128(dbias + global_index, dbias128); store128(dweight + global_index, dweight128); From e553e2f084b29bbf7a59006de593ac311d60fc19 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 20:44:50 +0300 Subject: [PATCH 069/172] update dev/cuda/layernorm_backward and improve `validate_result` to take into account fp epsilon when comparing results --- dev/cuda/common.h | 22 
+++++++++- dev/cuda/layernorm_backward.cu | 76 +++++++++++++--------------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 0c2079821..2757c67b5 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -3,6 +3,7 @@ #include #include #include +#include template @@ -260,13 +261,25 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, D* out_gpu = (D*)malloc(num_elements * sizeof(D)); cudaCheck(cudaMemcpy(out_gpu, device_result, num_elements * sizeof(D), cudaMemcpyDeviceToHost)); int nfaults = 0; +#ifndef ENABLE_BF16 + float epsilon = FLT_EPSILON; +#else + float epsilon = 0.079; +#endif for (int i = 0; i < num_elements; i++) { + // Skip masked elements + if(!isfinite(cpu_reference[i])) + continue; + // print the first few comparisons if (i < 5) { printf("%f %f\n", cpu_reference[i], (T)out_gpu[i]); } - // ensure correctness for all elements. We can set an "ignore" mask by writing NaN - if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && isfinite(cpu_reference[i])) { + // effective tolerance is based on expected rounding error (epsilon), + // plus any specified additional tolerance + float t_eff = tolerance + fabs(cpu_reference[i]) * epsilon; + // ensure correctness for all elements. + if (fabs(cpu_reference[i] - (T)out_gpu[i]) > t_eff) { printf("Mismatch of %s at %d: CPU_ref: %f vs GPU: %f\n", name, i, cpu_reference[i], (T)out_gpu[i]); nfaults ++; if (nfaults >= 10) { @@ -276,6 +289,11 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, } } + if (nfaults > 0) { + free(out_gpu); + exit(EXIT_FAILURE); + } + // reset the result pointer, so we can chain multiple tests and don't miss trivial errors, // like the kernel not writing to part of the result. // cudaMemset(device_result, 0, num_elements * sizeof(T)); diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index e22155247..90dcb1674 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -1014,25 +1014,6 @@ int main(int argc, char **argv) { float *dbias = make_zeros_float(C); layernorm_backward_cpu(dinp, dweight, dbias, dout, inp, weight, mean, rstd, B, T, C); - // convert all the necessary cpu data to floatX (e.g. 
bfloat16) - floatX* meanX = (floatX*)malloc(B * T * sizeof(floatX)); - floatX* rstdX = (floatX*)malloc(B * T * sizeof(floatX)); - floatX* doutX = (floatX*)malloc(B * T * C * sizeof(floatX)); - floatX* inpX = (floatX*)malloc(B * T * C * sizeof(floatX)); - floatX* weightX = (floatX*)malloc(C * sizeof(floatX)); - - for (int i = 0; i < B * T; i++) { - meanX[i] = (floatX)mean[i]; - rstdX[i] = (floatX)rstd[i]; - } - for (int i = 0; i < B * T * C; i++) { - doutX[i] = (floatX)dout[i]; - inpX[i] = (floatX)inp[i]; - } - for (int i = 0; i < C; i++) { - weightX[i] = (floatX)weight[i]; - } - // the above calculations act as the reference // now let's do the same on the GPU @@ -1063,33 +1044,39 @@ int main(int argc, char **argv) { cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(floatX))); cudaCheck(cudaMalloc(&d_scratch, cuda_num_SMs * (2 * C + 1) * sizeof(float))); // copy over the "inputs" to the backward call - cudaCheck(cudaMemcpy(d_dout, doutX, B * T * C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_inp, inpX, B * T * C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_weight, weightX, C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_mean, meanX, B * T * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_rstd, rstdX, B * T * sizeof(floatX), cudaMemcpyHostToDevice)); - // init the "outputs" of the backward call to zeros - cudaCheck(cudaMemset(d_dinp, 0, B * T * C * sizeof(floatX))); - cudaCheck(cudaMemset(d_dweight, 0, C * sizeof(floatX))); - cudaCheck(cudaMemset(d_dbias, 0, C * sizeof(floatX))); + cudaCheck(memcpy_convert(d_dout, dout, B * T * C)); + cudaCheck(memcpy_convert(d_inp, inp, B * T * C)); + cudaCheck(memcpy_convert(d_weight, weight, C)); + cudaCheck(memcpy_convert(d_mean, mean, B * T)); + cudaCheck(memcpy_convert(d_rstd, rstd, B * T)); // launch the kernel - const int block_size = 256; - layernorm_backward(kernel_num, d_dinp, d_dweight, d_dbias, d_scratch, d_dout, d_inp, d_weight, d_mean, d_rstd, B, T, C, block_size); - - // check the correctness of the kernel - float error_threshold_dinp = sizeof(floatX) == 4 ? 1e-3f : 1e-1f; // allow larger errors for BF16/FP16 - float error_threshold_dparams = sizeof(floatX) == 4 ? 1e-3f : 20.0f; // much, much larger... - printf("Checking correctness...\n"); - printf("dinp:\n"); - validate_result(d_dinp, dinp, "dinp", B * T * C, error_threshold_dinp); - printf("dweight:\n"); - validate_result(d_dweight, dweight, "dweight", C, error_threshold_dparams); - printf("dbias:\n"); - validate_result(d_dbias, dbias, "dbias", C, error_threshold_dparams); + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + // init the "outputs" of the backward call to zeros + cudaCheck(cudaMemset(d_dinp, 0, B * T * C * sizeof(floatX))); + cudaCheck(cudaMemset(d_dweight, 0, C * sizeof(floatX))); + cudaCheck(cudaMemset(d_dbias, 0, C * sizeof(floatX))); + + layernorm_backward(kernel_num, d_dinp, d_dweight, d_dbias, d_scratch, d_dout, d_inp, d_weight, d_mean, d_rstd, + B, T, C, block_size); + + // check the correctness of the kernel + float error_threshold_dinp = sizeof(floatX) == 4 ? 1e-3f : 1e-1f; // allow larger errors for BF16/FP16 + float error_threshold_dparams = sizeof(floatX) == 4 ? 1e-3f : 5e-1f; // much, much larger... 
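+        // (context for the smaller BF16 threshold: validate_result() now also grants |cpu_reference| * epsilon
+        //  of relative slack per element - FLT_EPSILON for fp32 builds, ~0.079 for BF16, per the common.h change
+        //  above - so this fixed threshold only has to cover the remaining absolute error, and the old 20.0f
+        //  blanket tolerance for the parameter gradients is no longer needed)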
+ printf("Checking correctness...\n"); + printf("dinp:\n"); + validate_result(d_dinp, dinp, "dinp", B * T * C, error_threshold_dinp); + printf("dweight:\n"); + validate_result(d_dweight, dweight, "dweight", C, error_threshold_dparams); + printf("dbias:\n"); + validate_result(d_dbias, dbias, "dbias", C, error_threshold_dparams); + + printf("All results match for block_size=%d.\n\n", block_size); + } // now time the kernel - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; int repeat_times = 100; @@ -1110,11 +1097,6 @@ int main(int argc, char **argv) { free(dinp); free(dweight); free(dbias); - free(meanX); - free(rstdX); - free(doutX); - free(inpX); - free(weightX); cudaCheck(cudaFree(d_dinp)); cudaCheck(cudaFree(d_dweight)); cudaCheck(cudaFree(d_dbias)); From 2d43e5bc977bac537ae1f6aff7de978fba328409 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 19:14:07 +0000 Subject: [PATCH 070/172] remove legacy comment --- dev/cuda/common.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 2757c67b5..f78e140a5 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -294,11 +294,6 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, exit(EXIT_FAILURE); } - // reset the result pointer, so we can chain multiple tests and don't miss trivial errors, - // like the kernel not writing to part of the result. - // cudaMemset(device_result, 0, num_elements * sizeof(T)); - // AK: taking this out, ~2 hours of my life was spent finding this line - free(out_gpu); } From 3b5933ecfb9dca85e5663effdb662092aac11a7f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 22:32:24 +0300 Subject: [PATCH 071/172] considerably speed up CPU matmul while still keeping it relatively readable --- train_gpt2.c | 70 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 9706a2c0b..06cdfbb54 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -158,32 +158,76 @@ void layernorm_backward(float* dinp, float* dweight, float* dbias, } } +void matmul_forward_slow(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // basic implementation of matrix multiplication. This serves as a fallback + // for bad input shapes, and as an illustration for the most basic version + // of the algorithm. +#pragma omp parallel for collapse(2) + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + int bt = b * T + t; + for (int o = 0; o < OC; o++) { + float val = (bias != NULL) ? bias[o] : 0.0f; + for (int i = 0; i < C; i++) { + val += inp[bt * C + i] * weight[o*C + i]; + } + out[bt * OC + o] = val; + } + } + } +} + void matmul_forward(float* out, - float* inp, float* weight, float* bias, + const float* inp, const float* weight, const float* bias, int B, int T, int C, int OC) { // most of the running time is spent here and in matmul_backward // OC is short for "output channels" // inp is (B,T,C), weight is (OC, C), bias is (OC) // out will be (B,T,OC) - #pragma omp parallel for collapse(2) - for (int b = 0; b < B; b++) { - for (int t = 0; t < T; t++) { - float* out_bt = out + b * T * OC + t * OC; - float* inp_bt = inp + b * T * C + t * C; - for (int o = 0; o < OC; o++) { - float val = (bias != NULL) ? 
bias[o] : 0.0f; - float* wrow = weight + o*C; - for (int i = 0; i < C; i++) { - val += inp_bt[i] * wrow[i]; + + // make sure the tiled loop will be correct, otherwise, fallback to slow version + const int LOOP_UNROLL = 8; + if (B*T % LOOP_UNROLL != 0) { + matmul_forward_slow(out, inp, weight, bias, B, T, C, OC); + return; + } + + // collapse the B and T loops into one and turn it into a strided loop. + // then we can tile the inner loop, and reuse the loaded weight LOOP_UNROLL many times + // for significant speed-ups. + #pragma omp parallel for + for (int obt = 0; obt < B * T; obt += LOOP_UNROLL) { + for (int o = 0; o < OC; o++) { + // keep LOOP_UNROLL many results in register, initialized by the bias term. + float result[LOOP_UNROLL]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of weight[i + o * C] and reuse it. + // we compile with -Ofast, so the compiler will turn the inner loop into a bunch of FMAs + for (int i = 0; i < C; i++) { + float w = weight[i + o * C]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + result[ibt] += inp[bt * C + i] * w; } - out_bt[o] = val; + } + + // write back results to main memory + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + out[bt * OC + o] = result[ibt]; } } } } void matmul_backward(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, + const float* dout, const float* inp, const float* weight, int B, int T, int C, int OC) { // most of the running time is spent here and in matmul_forward // this backward could be done in a single "round" of loops From b2a5508b84a0db561e371ef0092050e33c245a29 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 22:48:59 +0300 Subject: [PATCH 072/172] constness fixes --- train_gpt2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 06cdfbb54..0c5583e5e 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -237,10 +237,10 @@ void matmul_backward(float* dinp, float* dweight, float* dbias, #pragma omp parallel for collapse(2) for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { - float* dout_bt = dout + b * T * OC + t * OC; + const float* dout_bt = dout + b * T * OC + t * OC; float* dinp_bt = dinp + b * T * C + t * C; for (int o = 0; o < OC; o++) { - float* wrow = weight + o*C; + const float* wrow = weight + o*C; float d = dout_bt[o]; for (int i = 0; i < C; i++) { dinp_bt[i] += wrow[i] * d; @@ -253,8 +253,8 @@ void matmul_backward(float* dinp, float* dweight, float* dbias, for (int o = 0; o < OC; o++) { for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { - float* dout_bt = dout + b * T * OC + t * OC; - float* inp_bt = inp + b * T * C + t * C; + const float* dout_bt = dout + b * T * OC + t * OC; + const float* inp_bt = inp + b * T * C + t * C; float* dwrow = dweight + o*C; float d = dout_bt[o]; if (dbias != NULL) { dbias[o] += d; } From 92fc26eba4549057b40726d03e36d483e40542da Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 21:32:47 +0000 Subject: [PATCH 073/172] the nuts and bolts of gradient accumulation again, merged to master, but there is a bug and it doesn't work, debugging... 
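
The intended shape of the step, as a rough sketch (function and variable names match the diff below; this
leaves out the multi-GPU branch, timing, and logging, and is not the exact code): the fused classifier
pre-scales the loss gradient by 1/(B*T*grad_accum_steps), so gradients that accumulate with += across the
micro-steps come out as the mean over the full desired batch, and a single optimizer update is applied per
grad_accum_steps micro-batches.

    float lossf = 0.0f;  // mean loss over the accumulation steps
    for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) {
        dataloader_next_batch(&train_loader);
        // forward scales dloss by 1/(B*T*grad_accum_steps) internally
        gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps);
        lossf += model.mean_loss;   // mean_loss was already normalized by grad_accum_steps
        gpt2_backward(&model);      // parameter gradients accumulate with +=
    }
    model.mean_loss = lossf;
    gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
    gpt2_zero_grad(&model);
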
--- train_gpt2.cu | 86 ++++++++++++++++++++++++++++++++++----------------- train_gpt2.py | 49 ++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 39 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 78922a60b..69df89084 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1233,7 +1233,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, template __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs, - const floatX* dlosses, const int* targets, + const float dloss, const int* targets, int B, int T, int V, int P) { int idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data int ix = targets[idx]; @@ -1247,8 +1247,6 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) losses[idx] = (floatX)(-logf(prob)); } - // very sensible default for dlosses is 1/(B*T), which is the uniform loss - float dloss = (dlosses != NULL) ? (float)dlosses[idx] : 1.0f / (B*T); // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; @@ -1307,11 +1305,11 @@ __device__ float cast_value(half val) { template<> __device__ float cast_value(__nv_bfloat16 val) { return __bfloat162float(val); -} +} template __global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; // need to try grid stride looping for more perf later if (idx < n) { dst[idx] = cast_value(src[idx]); @@ -1647,13 +1645,13 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // replaces logits with logit gradients template void fused_classifier(Type* logits, Type* losses, - const Type* dlosses, const int* targets, + const float dloss, const int* targets, int B, int T, int V, int P) { NVTX_RANGE_FN(); const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel5<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1987,7 +1985,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->use_master_weights = 1; // keep master weights copy in float for optim update? } -void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true) { +void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true, int grad_accum_steps=1) { NVTX_RANGE_FN(); // targets are optional and could be NULL // in this function we must be careful and use size_t instead of int, otherwise @@ -2133,8 +2131,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // wait on memcpy of targets (definitely finished by now, but better safe than sorry) cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass - // we're passing dlosses = NULL, which will default them to 1.0f/(B*T), i.e. 
uniform loss - fused_classifier(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); + const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements + fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event @@ -2149,9 +2147,10 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // accumulate the loss immediately if we are not going to run gpt2_backward(), e.g. inference if (get_loss) { + assert(targets != NULL); // makes no sense to request loss if we don't have targets cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T; + model->mean_loss /= B*T*grad_accum_steps; } } @@ -2624,8 +2623,9 @@ void error_usage() { fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); - fprintf(stderr, " -b batch size B (default = 4)\n"); + fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); + fprintf(stderr, " -d total desired batch size (default = B * T * num_processes, i.e. no grad accumulation\n"); fprintf(stderr, " -l learning rate (default = 3e-4f)\n"); fprintf(stderr, " -x max_steps of optimization to run (-1 (default) = disable, run 1 epoch)\n"); fprintf(stderr, " -v val_loss_every, how often we evaluate val loss (default = 20)\n"); @@ -2650,6 +2650,7 @@ int main(int argc, char *argv[]) { const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max + int total_batch_size = -1; // will be calculated down below later, if not provided float learning_rate = 3e-4f; int val_loss_every = 20; // every how many steps do we eval validation loss? int val_max_batches = 20; // how many batches max do we eval for validation loss? 
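// (a hedged usage example for the new flag - the command line and process count here are made up for
//  illustration, only the formula comes from the code further down: grad_accum_steps is computed as
//  total_batch_size / (B * T * num_processes), so "-b 4 -t 1024 -d 32768" on a single process gives
//  32768 / (4 * 1024 * 1) = 8 micro-batches per optimizer update, and -d must be an exact multiple of
//  B * T * num_processes or the assert below fires)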
@@ -2668,8 +2669,9 @@ int main(int argc, char *argv[]) { if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } - else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU batch size + else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } + else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); } else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); } else if (argv[i][1] == 'x') { max_steps = atoi(argv[i+1]); } else if (argv[i][1] == 'v') { val_loss_every = atoi(argv[i+1]); } @@ -2679,16 +2681,19 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } - else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } + else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else { error_usage(); } } + // calculate a sensible default for total batch size by assuming no gradient accumulation + if (total_batch_size == -1) { total_batch_size = B * T * multi_gpu_config.num_processes; } printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| Parameter | Value |\n"); printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| input dataset prefix | %-50s |\n", input_dataset_prefix); printf0("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); - printf0("| batch size B | %-50d |\n", B); + printf0("| micro batch size B | %-50d |\n", B); printf0("| sequence length T | %-50d |\n", T); + printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); @@ -2747,9 +2752,17 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // more prints related to allocations from gpt2_build_from_checkpoint down here to not mess up our table above - printf0("num_parameters: %zu ==> bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); + printf0("num_parameters: %zu => bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024))); + // figure out gradient accumulation from the desired total batch size + int tokens_per_fwdbwd = B * T * multi_gpu_config.num_processes; // one micro-batch processes this many tokens + assert(total_batch_size % tokens_per_fwdbwd == 0); + int grad_accum_steps = total_batch_size / tokens_per_fwdbwd; + printf0("batch_size B=%d * seq_len T=%d * num_processes=%d and total_batch_size=%d\n", + B, T, multi_gpu_config.num_processes, total_batch_size); + printf0("=> setting grad_accum_steps=%d\n", grad_accum_steps); + // set up the Logger & Tokenizer Logger logger; logger_init(&logger, output_log_file); @@ -2841,30 +2854,47 @@ int main(int argc, char *argv[]) { // the validation/sampling one last time, and then we break right here as we're done. 
if (last_step) { break; } - // do a training step + // --------------- TRAINING SECTION BEGIN ----------------- + // do one training step, doing forward/backward/update on total_batch_size tokens cudaEventRecord(start); - if (overfit_single_batch == 0 || (step == 0 && overfit_single_batch == 1)) { - // if we're overfitting a single batch, we'll only call this at step = 0 - dataloader_next_batch(&train_loader); + // gradient accumulation loop over micro-batches + float lossf = 0.0f; // for getting the mean loss over the accumulation steps + for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) { + // fetch the next data batch + // and if we're overfitting a single batch, we'll only call this a single time + if (overfit_single_batch == 0 || + (overfit_single_batch == 1 && step == 0 && micro_step == 0)) { + dataloader_next_batch(&train_loader); + } + // forward pass. note that we pass in grad_accum_steps, which scales down the loss + gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps); + lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward + // backward pass. all model params accumulate gradients with += inside this inner loop + gpt2_backward(&model); } - gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, false); - gpt2_zero_grad(&model); - gpt2_backward(&model); -#ifndef MULTI_GPU + // override the mean loss, accounting for the gradient accumulation loop + // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced + model.mean_loss = lossf; + // update the parameters +#ifndef MULTI_GPU gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); #else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); #endif + // zero out the gradients for the next iteration + gpt2_zero_grad(&model); + cudaEventRecord(end); + cudaCheck(cudaEventSynchronize(end)); // wait for the end event to finish to get correct timings + // --------------- TRAINING SECTION END ------------------- + // everything that follows now is just diagnostics, prints, logging, etc. // todo - move or double-buffer all of this timing logic to avoid idling the GPU at this point! - cudaEventRecord(end); float time_elapsed_ms; - cudaCheck(cudaEventSynchronize(end)); // wait for the end event to finish to get correct timings cudaCheck(cudaEventElapsedTime(&time_elapsed_ms, start, end)); - - float tokens_per_second = multi_gpu_config.num_processes * (B * T) / time_elapsed_ms * 1000.0; + size_t tokens_processed = (size_t)multi_gpu_config.num_processes * B * T * grad_accum_steps; + float tokens_per_second = tokens_processed / time_elapsed_ms * 1000.0; float bias_corrected_ema_tokens_per_second = tokens_per_second; // by default set to non-ema version if (step > 0) { // consider the first batch to be a warmup (e.g. 
cuBLAS/cuDNN initialisation) total_sum_iteration_time_s += time_elapsed_ms / 1000.0; diff --git a/train_gpt2.py b/train_gpt2.py index 80547b8f1..c50fc7b61 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -404,8 +404,9 @@ def print0(*args, **kwargs): parser.add_argument("--tensorcores", type=int, default=0, help="use tensorcores") parser.add_argument("--flash", type=int, default=0, help="use flash attention") parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run") - parser.add_argument("--batch_size", type=int, default=4, help="batch size") + parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") + parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") args = parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -443,6 +444,13 @@ def print0(*args, **kwargs): device = "mps" print(f"using device: {device}") + # calculate gradient accumulation from the desired total batch size and the current run configuration + tokens_per_fwdbwd = B * T * ddp_world_size + assert args.total_batch_size % tokens_per_fwdbwd == 0 + grad_accum_steps = args.total_batch_size // tokens_per_fwdbwd + print(f"total desired batch size: {args.total_batch_size}") + print(f"=> calculated gradient accumulation steps: {grad_accum_steps}") + # set up a context manager following the desired dtype and device ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[args.dtype] ctx = torch.amp.autocast(device_type="cuda", dtype=ptdtype) if device == "cuda" else nullcontext() @@ -544,14 +552,33 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] - for i in range(args.num_iterations): + for step in range(args.num_iterations): t0 = time.time() - with ctx: - _, loss = model(x, y, return_logits=False) - if not args.inference_only: - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() + + # micro-batch loop where we do gradient accumulation to reach desired total batch size + lossf = 0.0 # for getting the mean loss (as simple float) over the accumulation steps + for micro_step in range(grad_accum_steps): + # forward pass + with ctx: + _, loss = model(x, y, return_logits=False) + # we have to scale the loss to account for gradient accumulation, + # because the gradients just add on each successive backward(). 
+ # addition of gradients corresponds to a SUM in the objective, but + # instead of a SUM we want MEAN, so we scale the loss here + loss = loss / grad_accum_steps + lossf += loss.item() # keep track of the mean loss + if ddp: + # we want only the last micro-step to sync grads in a DDP model + # the official way to do this is with model.no_sync(), but that is a + # context manager that bloats the code, so we just toggle this variable + model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) + # backward pass + if not args.inference_only: + loss.backward() + # todo: grad clip here + optimizer.step() + optimizer.zero_grad(set_to_none=True) + # wait on the CPU for all device work to end so we get accurate per-iteration timings below if device == "mps": torch.mps.synchronize() @@ -560,9 +587,9 @@ def get_batch(): # time and print t1 = time.time() # the 0th iteration is often an outlier (much slower) => skip logging it - tokens_per_second = ddp_world_size * B * T / (t1-t0) - print0(f"iteration {i+1}, loss: {loss.item():.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") - if i > 0 and i > args.num_iterations - 20: + tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0) + print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") + if step > 0 and step > args.num_iterations - 20: timings.append(t1-t0) # print the average of the last 20 timings, to get something smooth-ish From a4567ae940d527ea9e53b4f366517c472d16f09e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 22:13:54 +0000 Subject: [PATCH 074/172] delete parallels, still not fixed --- train_gpt2.cu | 48 +++++++----------------------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 69df89084..b5a2b1e5c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -123,12 +123,7 @@ cublasHandle_t cublas_handle; cudaDeviceProp deviceProp; // CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
-constexpr int num_parallel_streams = 2; // + 1 primary "main_stream" (+ default stream) -cudaStream_t parallel_streams[num_parallel_streams]; -cudaEvent_t parallel_events[num_parallel_streams]; cudaStream_t main_stream; -cudaEvent_t main_event; -cudaEvent_t loss_event; // to make sure fused_classifier has written the losses to the CPU buffer // convenience macro for calculating grid/block dimensions for kernels #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) @@ -1558,7 +1553,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, if(grid_size_y == 1) { matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); + cudaMemset(dbias_buffer, 0, OC * sizeof(float)); matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } @@ -1586,7 +1581,7 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); - cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); + cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -2041,11 +2036,10 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // copy inputs/targets to the model // todo - inputs is copied on default stream so this synchronises CPU/GPU for now - cudaCheck(cudaMemcpyAsync(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice, 0)); + cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice)); if (targets != NULL) { // memcpy targets in parallel then wait for them before fused_classifier - cudaCheck(cudaMemcpyAsync(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice, parallel_streams[0])); - cudaEventRecord(parallel_events[0], parallel_streams[0]); + cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); } // forward pass @@ -2128,16 +2122,9 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // also forward the cross-entropy loss function if we have the targets if (targets != NULL) { NvtxRange classifier_and_loss_range("classifier_and_loss"); - // wait on memcpy of targets (definitely finished by now, but better safe than sorry) - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); - - // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() - // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event - cudaEventRecord(loss_event, main_stream); - // reset mean_loss here so gpt2_backward() knows we have targets model->mean_loss = 0.0f; } else { @@ -2148,7 +2135,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // accumulate the loss immediately if we are not going to run gpt2_backward(), e.g. 
inference if (get_loss) { assert(targets != NULL); // makes no sense to request loss if we don't have targets - cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } model->mean_loss /= B*T*grad_accum_steps; } @@ -2157,11 +2143,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo void gpt2_zero_grad(GPT2 *model) { NVTX_RANGE_FN(); if (model->grads_memory != NULL) { - cudaCheck(cudaMemsetAsync(model->grads_memory, 0, model->num_parameters * sizeof(floatX), parallel_streams[0])); + cudaCheck(cudaMemset(model->grads_memory, 0, model->num_parameters * sizeof(floatX))); } - // Allow this to run in parallel with forward pass, but create a dependency with everything after (backwards pass) - cudaEventRecord(parallel_events[0], parallel_streams[0]); - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); } void gpt2_backward(GPT2 *model) { @@ -2207,10 +2190,7 @@ void gpt2_backward(GPT2 *model) { GradActTensors grads_acts = model->grads_acts; // reset residual stream gradients (put here to work with gradient accumulation) - cudaCheck(cudaMemsetAsync(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX), parallel_streams[0])); - // allow the memset to run in parallel with the forward pass, but create a dependency with everything after - cudaEventRecord(parallel_events[0], parallel_streams[0]); - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); + cudaCheck(cudaMemset(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX))); // re-use the output buffer of the forward pass as a scratchpad during backward pass float* scratchF = (float*)acts.output; @@ -2302,7 +2282,6 @@ void gpt2_backward(GPT2 *model) { encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); // accumulate the loss, this was calculated at the end of gpt2_forward() - cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } model->mean_loss /= B*T; } @@ -2439,12 +2418,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } // set up cuBLAS and cuBLASLt (and cuDNN if enabled) cublasCheck(cublasCreate(&cublas_handle)); @@ -2463,14 +2436,7 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } void common_free(GPT2 &model) { - cudaCheck(cudaEventDestroy(main_event)); - cudaCheck(cudaEventDestroy(loss_event)); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamDestroy(parallel_streams[i])); - cudaCheck(cudaEventDestroy(parallel_events[i])); - } cudaCheck(cudaStreamDestroy(main_stream)); - gpt2_free(&model); cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); @@ -2819,7 +2785,7 @@ int main(int argc, char *argv[]) { // we re-calculate the forward pass for all of (B,T) positions from scratch // but the inference here is just for sanity checking anyway // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(&model, gen_tokens, NULL, B, T); + gpt2_forward(&model, gen_tokens, NULL, B, T, false); // furthermore, 
below we're only using b=0 (i.e. the first row) of all B rows // we're in principle running B "inference streams" in parallel here // only using position 0 because it's a bit faster (copy less probs from GPU -> CPU) From 3c3c965840b239480c87d3dfe04a70bf70986164 Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:16:24 -0700 Subject: [PATCH 075/172] Adding Mersenne Twisters C --- rand.h | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 rand.h diff --git a/rand.h b/rand.h new file mode 100644 index 000000000..f69340d20 --- /dev/null +++ b/rand.h @@ -0,0 +1,141 @@ +#ifndef RAND_H +#define RAND_H + +#include + +#define MERSENNE_STATE_M 397u +#define MERSENNE_STATE_N 624u + +#define LMASK 0x7ffffffful +#define UMASK 0x80000000ul + +// Copyright(c) Makoto Matsumoto and Takuji Nishimura + +// This implementation follows PyTorch so that we are numerically identical when running verification tests. + +typedef struct { + unsigned long long seed_; + int left_; + unsigned int next_; + unsigned int state_[MERSENNE_STATE_N]; + unsigned int MATRIX_A[2]; +} mt19937_state; + +void manual_seed(mt19937_state* state, unsigned int seed) { + state->MATRIX_A[0] = 0x0u; + state->MATRIX_A[1] = 0x9908b0df; + state->state_[0] = seed & 0xffffffff; + for (unsigned int j = 1; j < MERSENNE_STATE_N; j++) { + state->state_[j] = 1812433253 * (state->state_[j - 1] ^ (state->state_[j - 1] >> 30)) + j; + state->state_[j] &= 0xffffffff; + } + state->left_ = 1; + state->next_ = 0; +} + +void next_state(mt19937_state* state) { + state->left_ = MERSENNE_STATE_N; + state->next_ = 0; + unsigned int y, j; + for (j = 0; j < MERSENNE_STATE_N - MERSENNE_STATE_M; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + MERSENNE_STATE_M] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + for (; j < MERSENNE_STATE_N - 1; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + (MERSENNE_STATE_M - MERSENNE_STATE_N)] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + y = (state->state_[MERSENNE_STATE_N - 1] & UMASK) | (state->state_[0] & LMASK); + state->state_[MERSENNE_STATE_N - 1] = state->state_[MERSENNE_STATE_M - 1] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; +} + +unsigned int randint32(mt19937_state* state) { + if (!state) return 0; + if (state->MATRIX_A[0] != 0 || state->MATRIX_A[1] != 0x9908b0df) manual_seed(state, 5489); // auto-initialize + if (--state->left_ <= 0) { + next_state(state); + } + unsigned int y = state->state_[state->next_++]; + y ^= y >> 11; + y ^= (y << 7) & 0x9d2c5680; + y ^= (y << 15) & 0xefc60000; + y ^= y >> 18; + return y; +} + +inline unsigned long long randint64(mt19937_state* state) { + return (((unsigned long long)(randint32(state)) << 32) | randint32(state)); +} + +inline float randfloat32(mt19937_state* state) { + return (randint32(state) & ((1ull << 24) - 1)) * (1.0f / (1ull << 24)); +} + +inline double randfloat64(mt19937_state* state) { + return (randint64(state) & ((1ull << 53) - 1)) * (1.0 / (1ull << 53)); +} + +void uniform_(float* data, unsigned int numel, float from, float to, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state) * (to - from) + from; + } +} + +// Box–Muller transform + +void normal_fill_16(float* data, float mean, float std, mt19937_state* state) { + #define EPSILONE 1e-12 + for (unsigned int t = 0; t < 8; t++) { + float u1 = 1 - data[t]; + float u2 = 
data[t + 8]; + float radius = sqrtf(-2 * logf(u1 + EPSILONE)); + float theta = 2.0 * M_PI * u2; + data[t] = (radius * cosf(theta) * std + mean); + data[t + 8] = (radius * sinf(theta) * std + mean); + } +} + +void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state); + } + for (unsigned int i = 0; i < numel - 15; i += 16) { + normal_fill_16(data + i, mean, std, state); + } + if (numel % 16 != 0) { + // recompute the last 16 values + data = data + numel - 16; + for (unsigned int i = 0; i < 16; i++) { + data[i] = randfloat32(state); + } + normal_fill_16(data, mean, std, state); + } +} + +void normal_(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + #define EPSILONE 1e-12 + if (numel >= 16) { + normal_fill(data, numel, mean, std, state); + } + else { + double next_double_normal_sample; + int has_next_double_normal_sample = 0; + for (unsigned int t = 0; t < numel; t++) { + if (has_next_double_normal_sample) { + data[t] = (float)(next_double_normal_sample * std + mean); + has_next_double_normal_sample = 0; + continue; + } + // for numel < 16 we draw a double (float64) + float u1 = randfloat64(state); + float u2 = randfloat64(state); + float radius = sqrtf(-2 * logf(1 - u2 + EPSILONE)); + float theta = 2.0 * M_PI * u1; + next_double_normal_sample = radius * sinf(theta); + has_next_double_normal_sample = 1; + data[t] = (radius * cosf(theta) * std + mean); + } + } +} + +#endif \ No newline at end of file From 7de50af5115bff97980c9acb1a17f7fd875793aa Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:38:09 -0700 Subject: [PATCH 076/172] train_gpt.c: Removing the hardcoded GPT2_EOT --- train_gpt2.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 9706a2c0b..dbe4ca502 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1067,9 +1067,6 @@ void dataloader_free(DataLoader *loader) { // ---------------------------------------------------------------------------- // sampler -// the GPT-2 end-of-text token id -#define GPT2_EOT 50256 - unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; @@ -1149,7 +1146,7 @@ int main() { if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation for(int i = 0; i < B * T; ++i) { - gen_tokens[i] = GPT2_EOT; + gen_tokens[i] = tokenizer.eot; } // now sample from the model autoregressively printf("generating:\n---\n"); From 16f9dad3011a0932e39f5762ec054863023455dd Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:45:08 -0700 Subject: [PATCH 077/172] Update train_gpt2.c --- train_gpt2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_gpt2.c b/train_gpt2.c index dbe4ca502..95c46ab86 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1146,7 +1146,7 @@ int main() { if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation for(int i = 0; i < B * T; ++i) { - gen_tokens[i] = tokenizer.eot; + gen_tokens[i] = tokenizer.eot_token; } // now sample from the model autoregressively printf("generating:\n---\n"); From 160b3bd007336bb5866deed7adf60cd9a6cf1a1f Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 18:22:39 -0700 Subject: [PATCH 078/172] Setting up dev/CPU area with the first matmul_forward.c --- dev/cpu/matmul_forward.c | 217 
+++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 dev/cpu/matmul_forward.c diff --git a/dev/cpu/matmul_forward.c b/dev/cpu/matmul_forward.c new file mode 100644 index 000000000..f7b714326 --- /dev/null +++ b/dev/cpu/matmul_forward.c @@ -0,0 +1,217 @@ +/* +CPU Kernels for matmul forward pass. +*/ + +// Compile Examples: +// +// MSVC: cl.exe /O2 /fp:fast /Qvec-report:2 /I. /I ..\..\dev matmul_forward.c +// cl.exe /O2 /fp:fast /Qvec-report:2 /arch:AVX /I. /I ..\..\dev matmul_forward.c +// cl.exe /O2 /fp:fast /Qvec-report:2 /arch:AVX2 /I. /I ..\..\dev matmul_forward.c +// + +#include +#include +#include +#include +#include + +// ---------------------------------------------------------------------------- +// CPU code reference + +void matmul_forward_cpu(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // OC is short for "output channels" + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // out will be (B,T,OC) + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + float* out_bt = out + b * T * OC + t * OC; + const float* inp_bt = inp + b * T * C + t * C; + for (int o = 0; o < OC; o++) { + float val = (bias != NULL) ? bias[o] : 0.0f; + const float* wrow = weight + o*C; + for (int i = 0; i < C; i++) { + val += inp_bt[i] * wrow[i]; + } + out_bt[o] = val; + } + } + } +} + +void matmul_forward_ngc92(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // most of the running time is spent here and in matmul_backward + // OC is short for "output channels" + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // out will be (B,T,OC) + + // make sure the tiled loop will be correct, otherwise, fallback to slow version + #define LOOP_UNROLL 8 + + if (B * T % LOOP_UNROLL != 0) { + printf("MUST BE A MULTIPLE OF 8"); // FIXME + return; + } + + // collapse the B and T loops into one and turn it into a strided loop. + // then we can tile the inner loop, and reuse the loaded weight LOOP_UNROLL many times + // for significant speed-ups. + for (int obt = 0; obt < B * T; obt += LOOP_UNROLL) { + for (int o = 0; o < OC; o++) { + // keep LOOP_UNROLL many results in register, initialized by the bias term. + float result[LOOP_UNROLL]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of weight[i + o * C] and reuse it. 
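+ // (each cached w is reused for LOOP_UNROLL different (b,t) rows, so a weight element is read from memory once per tile of rows rather than once per output element)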
+ // we compile with -Ofast, so the compiler will turn the inner loop into a bunch of FMAs + for (int i = 0; i < C; i++) { + float w = weight[i + o * C]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + result[ibt] += inp[bt * C + i] * w; + } + } + + // write back results to main memory + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + out[bt * OC + o] = result[ibt]; + } + } + } +} + +#define NUM_KERNELS 2 + +void matmul_forward(int kernel_num, + float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + + switch (kernel_num) { + case 0: + matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC); + break; + case 1: + matmul_forward_ngc92(out, inp, weight, bias, B, T, C, OC); + break; + default: + printf("Invalid kernel number\n"); + exit(1); + } +} + + +void validate_results_cpu(const float* device_result, const float* cpu_reference, const char* name, int num_elements, float tolerance); +float* make_random_float(size_t N); + +int main(int argc, char **argv) { + srand(0); + + int B = 8; + int T = 1024; + int C = 768; + int OC = 768 * 4; // expansion of 4, e.g. in the MLP + int RUNS = 4; // number of times to run a kernel for benchmarks + + srand(137); + + float* out = make_random_float(B * T * OC); + float* inp = make_random_float(B * T * C); + float* weight = make_random_float(OC * C); + float* bias = make_random_float(OC); + + float* grad_out = make_random_float(B * T * OC); + float* grad_inp = make_random_float(B * T * C); + float* grad_weight = make_random_float(OC * C); + float* grad_bias = make_random_float(OC); + + printf("> Calculating reference\n"); + matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC); + + for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) { + printf("> Verifying kernel #%d\n", kernel_num); + + srand(137); + + float* kernel_out = make_random_float(B * T * OC); + float* kernel_inp = make_random_float(B * T * C); + float* kernel_weight = make_random_float(OC * C); + float* kernel_bias = make_random_float(OC); + + matmul_forward(kernel_num, kernel_out, kernel_inp, kernel_weight, kernel_bias, B, T, C, OC); + + validate_results_cpu(kernel_out, out, "out", B * T * OC, 1e-5); + + free(kernel_out); + free(kernel_inp); + free(kernel_weight); + free(kernel_bias); + } + + printf("All kernels passed! 
Starting benchmarks.\n\n"); + + for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) { + printf("> Running kernel #%d\n", kernel_num); + struct timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); + + for (int i = 0; i < RUNS; i++) { + matmul_forward(kernel_num, out, inp, weight, bias, B, T, C, OC); + } + + clock_gettime(CLOCK_MONOTONIC, &end); + double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; + printf("> Kernel #%d, (took %f ms)\n", kernel_num, time_elapsed_s * 1000); + } + + // free memory + free(out); + free(inp); + free(weight); + free(bias); + + free(grad_out); + free(grad_inp); + free(grad_weight); + free(grad_bias); + + return 0; +} + +float* make_random_float(size_t N) { + float* arr = (float*)malloc(N * sizeof(float)); + for (size_t i = 0; i < N; i++) { + arr[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; // range -1..1 + } + return arr; +} + +void validate_results_cpu(const float* kernel_result, const float* cpu_reference, const char* name, int num_elements, float tolerance) { + int nfaults = 0; + for (int i = 0; i < num_elements; i++) { + // print the first few comparisons + if (i < 5) { + printf("%f %f\n", cpu_reference[i], kernel_result[i]); + } + float t_eff = tolerance + fabs(cpu_reference[i]); + // ensure correctness for all elements. + if (fabs(cpu_reference[i] - kernel_result[i]) > t_eff) { + printf("Mismatch of %s at %d: CPU_ref: %f vs CPU_new: %f\n", name, i, cpu_reference[i], kernel_result[i]); + nfaults++; + if (nfaults >= 10) { + exit(EXIT_FAILURE); + } + } + } + if (nfaults > 0) { + exit(EXIT_FAILURE); + } + printf("OK\n"); +} \ No newline at end of file From 8eb3a432d991bf23bed3fae9f92dd395015ea457 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 15 May 2024 21:09:46 +0000 Subject: [PATCH 079/172] revert all streams and synchronization events, we'll bring them back but only one at a time and very very carefully, because they cause really subtle bugs and issues --- train_gpt2.cu | 96 ++++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 55 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b5a2b1e5c..b53f7712c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -122,9 +122,6 @@ cublasLtHandle_t cublaslt_handle; cublasHandle_t cublas_handle; cudaDeviceProp deviceProp; -// CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
-cudaStream_t main_stream; - // convenience macro for calculating grid/block dimensions for kernels #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) @@ -1327,7 +1324,7 @@ void encoder_forward(floatX* out, const int block_size = 256; const int N = B * T * C; const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size)); - encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); + encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1338,7 +1335,7 @@ void encoder_backward(floatX* dwte, floatX* dwpe, const int N = B * T * C; const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); + encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); cudaCheck(cudaGetLastError()); } @@ -1349,7 +1346,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, const int block_size = 512; const int N = B * T; const int grid_size = CEIL_DIV(N * 32, block_size); - layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); + layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); cudaCheck(cudaGetLastError()); } @@ -1413,7 +1410,7 @@ void matmul_forward_cublaslt(floatX* out, cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, &alpha, weight, weightLayout, inp, inputLayout, &beta, out, outputLayout, out, outputLayout, &heuristic.algo, - cublaslt_workspace, cublaslt_workspace_size, main_stream)); + cublaslt_workspace, cublaslt_workspace_size, 0)); // cleanups cublasCheck(cublasLtMatmulPreferenceDestroy(preference)); @@ -1445,7 +1442,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, v = qkvr + 2 * B * T * C; int total_threads = B * NH * T * HS; int num_blocks = CEIL_DIV(total_threads, block_size); - permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); + permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); floatX* preatt = inp; @@ -1460,7 +1457,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); int grid_size = CEIL_DIV(B * NH * T * 32, block_size); - softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); + softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); // new approach: first cuBLAS another batched matmul floatX* vaccum = inp; @@ -1476,7 +1473,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // now unpermute // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); + unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); cudaCheck(cudaGetLastError()); } @@ -1484,7 +1481,7 @@ void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N NVTX_RANGE_FN(); const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * x128::size); - residual_forward_kernel<<>>(out, inp1, inp2, N); + residual_forward_kernel<<>>(out, inp1, inp2, N); cudaCheck(cudaGetLastError()); } @@ -1517,7 +1514,7 @@ void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_forward_kernel2<<>>(out, inp, N); + gelu_forward_kernel2<<>>(out, inp, N); cudaCheck(cudaGetLastError()); } @@ -1525,7 +1522,7 @@ void gelu_backward(floatX* dinp, const floatX* inp, const floatX* dout, const in NVTX_RANGE_FN(); const int 
block_size = 128; const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_backward_kernel<<>>(dinp, inp, dout, N); + gelu_backward_kernel<<>>(dinp, inp, dout, N); cudaCheck(cudaGetLastError()); } @@ -1551,11 +1548,11 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemset(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } @@ -1582,7 +1579,7 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); - layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1610,7 +1607,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward through the unpermute operation int num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); + unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); // backward into datt cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, &beta, @@ -1622,7 +1619,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward into preatt int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); - softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); + softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); // backward into q cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &alpha, k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, @@ -1633,7 +1630,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da dk, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into inp num_blocks = CEIL_DIV(B * NH * T * HS, block_size); - permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); + permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); cudaCheck(cudaGetLastError()); } @@ -1646,7 +1643,7 @@ void fused_classifier(Type* logits, Type* losses, const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1738,7 +1735,7 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen return params_memory; } -#define NUM_ACTIVATION_TENSORS 20 +#define NUM_ACTIVATION_TENSORS 21 typedef struct { floatX* encoded; // (B, T, C) floatX* ln1; // (L, B, T, C) @@ -1758,6 +1755,7 @@ typedef 
struct { floatX* lnf; // (B, T, C) floatX* lnf_mean; // (B, T) floatX* lnf_rstd; // (B, T) + floatX* losses; // (B, T) // adding these two compared to the CPU .c code, needed for attention kernel as buffers floatX* qkvr; // (L, B, T, 3*C) // in inference mode, this buffer will store the logits @@ -1796,8 +1794,9 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[15] = B * T * C; // lnf act_sizes[16] = B * T; // lnf_mean act_sizes[17] = B * T; // lnf_rstd - act_sizes[18] = L * B * T * 3*C; // qkvr - act_sizes[19] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch + act_sizes[18] = B * T; // losses + act_sizes[19] = L * B * T * 3*C; // qkvr + act_sizes[20] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch } // Backward pass is conceptually quite different from forward, because we can discard @@ -1848,7 +1847,7 @@ void* malloc_and_point_activations(ActivationTensors* acts, const size_t* act_si &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->atty, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->qkvr, &acts->output + &acts->lnf_mean, &acts->lnf_rstd, &acts->losses, &acts->qkvr, &acts->output }; return malloc_and_point(ptrs, act_sizes, NUM_ACTIVATION_TENSORS); } @@ -1980,7 +1979,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->use_master_weights = 1; // keep master weights copy in float for optim update? } -void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true, int grad_accum_steps=1) { +void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { NVTX_RANGE_FN(); // targets are optional and could be NULL // in this function we must be careful and use size_t instead of int, otherwise @@ -2038,7 +2037,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // todo - inputs is copied on default stream so this synchronises CPU/GPU for now cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice)); if (targets != NULL) { - // memcpy targets in parallel then wait for them before fused_classifier cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); } @@ -2124,20 +2122,17 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo NvtxRange classifier_and_loss_range("classifier_and_loss"); // fused classifier: does the forward pass and first part of the backward pass const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements - fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); - // reset mean_loss here so gpt2_backward() knows we have targets - model->mean_loss = 0.0f; + fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp); + // for convenience also evaluate the mean loss (TODO re-think this compute+sync point) + cudaCheck(cudaMemcpy(model->cpu_losses, acts.losses, B * T * sizeof(floatX), cudaMemcpyDeviceToHost)); + float mean_loss = 0.0f; + for (int i = 0; i < B*T; i++) { mean_loss += (float)(model->cpu_losses[i]); } + mean_loss /= B*T*grad_accum_steps; + model->mean_loss = mean_loss; } else { // if we don't have targets, we don't have loss model->mean_loss = -1.0f; } - - // accumulate the loss immediately if we are 
not going to run gpt2_backward(), e.g. inference - if (get_loss) { - assert(targets != NULL); // makes no sense to request loss if we don't have targets - for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T*grad_accum_steps; - } } void gpt2_zero_grad(GPT2 *model) { @@ -2280,10 +2275,6 @@ void gpt2_backward(GPT2 *model) { layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); - - // accumulate the loss, this was calculated at the end of gpt2_forward() - for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T; } // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. @@ -2303,7 +2294,7 @@ float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_conf void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU NVTX_RANGE_FN(); - if (multi_gpu_config->num_processes == 1) return; + if (multi_gpu_config->num_processes == 1) { return; } // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); // Average all gradients. @@ -2311,7 +2302,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { model->num_parameters, ncclFloatX, ncclAvg, multi_gpu_config->nccl_comm, - main_stream)); + 0)); #endif } @@ -2330,7 +2321,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + copy_and_cast_kernel<<num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); cudaCheck(cudaGetLastError()); printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); } @@ -2341,7 +2332,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, + adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, (floatX*)model->grads_memory, model->m_memory, model->v_memory, model->num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); @@ -2363,7 +2354,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); - copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); + copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } @@ -2374,7 +2365,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float 
learning_rate, float beta1, float float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, + adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); @@ -2383,8 +2374,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return; - + if (multi_gpu_config->num_processes == 1) { return; } // 1 process => noop if (multi_gpu_config->zero_stage == 1) { // gather updated shards of model->params_memory from each process ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, @@ -2417,11 +2407,8 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name); } - cudaCheck(cudaStreamCreate(&main_stream)); - // set up cuBLAS and cuBLASLt (and cuDNN if enabled) cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); cublasCheck(cublasLtCreate(&cublaslt_handle)); cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); @@ -2436,7 +2423,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } void common_free(GPT2 &model) { - cudaCheck(cudaStreamDestroy(main_stream)); gpt2_free(&model); cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); @@ -2785,7 +2771,7 @@ int main(int argc, char *argv[]) { // we re-calculate the forward pass for all of (B,T) positions from scratch // but the inference here is just for sanity checking anyway // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(&model, gen_tokens, NULL, B, T, false); + gpt2_forward(&model, gen_tokens, NULL, B, T); // furthermore, below we're only using b=0 (i.e. the first row) of all B rows // we're in principle running B "inference streams" in parallel here // only using position 0 because it's a bit faster (copy less probs from GPU -> CPU) @@ -2833,7 +2819,7 @@ int main(int argc, char *argv[]) { dataloader_next_batch(&train_loader); } // forward pass. note that we pass in grad_accum_steps, which scales down the loss - gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps); + gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward // backward pass. 
all model params accumulate gradients with += inside this inner loop gpt2_backward(&model); From 2ccdfb70e0656014e30ab2843d4c539259f809a4 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 10:44:12 +0300 Subject: [PATCH 080/172] general cleanup --- dev/cuda/matmul_backward_bias.cu | 66 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 0bf5e44dd..820741b0f 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -324,45 +324,50 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout // kernel launcher // version1: simple cuBLAS calls -void matmul_backward_bias1(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias1(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); matmul_backward_bias_kernel1<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias2(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias2(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { // block_size 512 seems best const int grid_size = ceil_div(OC * 32, block_size); matmul_backward_bias_kernel2<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias3(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias3(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { // block_size 256 seems best matmul_backward_bias_kernel3<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias4(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias4(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { assert(OC % 32 == 0); // OC must be divisible by 32 for this kernel const int grid_size = OC / 32; matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } #ifndef ENABLE_BF16 -void matmul_backward_bias5(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias5(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } #endif -void matmul_backward_bias7(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias7(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { if(block_size < 256) { block_size = 256; } @@ -381,14 +386,16 @@ void matmul_backward_bias7(floatX* dbias, floatX* dout, assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC, block_size); + dim3(block_size_x, block_size_y), OC_per_warp * sizeof(float)>>>(dbias_buffer, dout, B, T, OC, block_size); + cudaCheck(cudaGetLastError()); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + 
cudaCheck(cudaGetLastError()); } -void matmul_backward_bias8(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias8(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { dim3 block_dim = {4, 8, (unsigned)block_size/32}; const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 @@ -398,41 +405,44 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, // and write results directly to the output. if(grid_size_y == 1) { matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); } else { - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + cudaCheck(cudaGetLastError()); } } void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { + int B, int T, int OC, int block_size) { switch (kernel_num) { case 1: - matmul_backward_bias1(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias1(dbias, dout, B, T, OC, block_size); break; case 2: - matmul_backward_bias2(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias2(dbias, dout, B, T, OC, block_size); break; case 3: - matmul_backward_bias3(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias3(dbias, dout, B, T, OC, block_size); break; case 4: - matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias4(dbias, dout, B, T, OC, block_size); break; case 5: #ifndef ENABLE_BF16 - matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias5(dbias, dout, B, T, OC, block_size); #else fprintf(stderr, "Kernel 5 is only supported for fp32"); exit(1); #endif break; case 7: - matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias7(dbias, dout, B, T, OC, block_size); break; case 8: - matmul_backward_bias8(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias8(dbias, dout, B, T, OC, block_size); break; default: printf("Invalid kernel number\n"); @@ -466,7 +476,7 @@ int main(int argc, char **argv) { floatX* d_dout; cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(floatX))); cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(floatX))); - cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); + cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float) * 32)); cudaCheck(memcpy_convert(d_dbias, dbias, OC)); cudaCheck(memcpy_convert(d_dout, dout, B * T * OC)); @@ -489,7 +499,7 @@ int main(int argc, char **argv) { // memset the bias to zero cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(floatX))); // calculate the GPU version - matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, C, OC, block_size); + matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, OC, block_size); // compare printf("Checking correctness...\n"); float tol = std::is_same_v ? 
5e-3f : 1.0f; @@ -502,7 +512,7 @@ int main(int argc, char **argv) { int block_size = block_sizes[j]; int repeat_times = 2000; float elapsed_time = benchmark_kernel(repeat_times, matmul_backward_bias, kernel_num, - d_dbias, d_dout, B, T, C, OC, block_size); + d_dbias, d_dout, B, T, OC, block_size); printf("block_size %d time %.4f ms\n", block_size, elapsed_time); } From 858c6e6dae447470e716bd1f0f47931980ebb4f6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 18:40:57 +0300 Subject: [PATCH 081/172] deterministic kernel --- dev/cuda/matmul_backward_bias.cu | 120 +++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 820741b0f..12b167083 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -320,6 +320,101 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout } } +// Like kernel 8, but instead of accumulating to the auxiliary buffer, it writes +// multiple values that need to be summed up in a separate kernel call. +// If UseAuxBuffer is false, gridDim.y has to be one, and results are added directly +// to dbias. +template +__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); + + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 64 at BF16 + + int local_oc = warp_c * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + + if(global_oc < OC) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } + __syncthreads(); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + if constexpr (!UseAuxBuffer) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); + } else { + dbias[global_oc + k + blockIdx.y * OC] = a; + } + } + } +} + + +__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { + const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; + assert(n % x128::size == 0); + if (idx < n) { + f128 acc; + for(int k = 0; k < f128::size; ++k) { + acc[k] = 0.f; + } + + for(int l = 0; l < m; ++l) { + f128 s = load128(src + idx + n * l); + for(int k = 0; k < f128::size; ++k) { + acc[k] += s[k]; 
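+ // (the m per-slice partial sums are accumulated in a fixed order here, which keeps the reduction deterministic, unlike an atomicAdd-based version)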
+ } + } + for(int k = 0; k < f128::size; ++k) { + dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); + } + } +} + + // ---------------------------------------------------------------------------- // kernel launcher @@ -415,6 +510,28 @@ void matmul_backward_bias8(floatX* dbias, const floatX* dout, } } + +void matmul_backward_bias9(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. + if(grid_size_y == 1) { + matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + } else { + // kernel 9 overwrites temp buffer, so no need to memset + matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); + cudaCheck(cudaGetLastError()); + } +} + void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int OC, int block_size) { switch (kernel_num) { @@ -444,6 +561,9 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, case 8: matmul_backward_bias8(dbias, dout, B, T, OC, block_size); break; + case 9: + matmul_backward_bias9(dbias, dout, B, T, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From 7b810c1a3bf821d65bebd344b76c7c518392bd2a Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 00:20:52 +0300 Subject: [PATCH 082/172] update main training script --- train_gpt2.cu | 55 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..07d6b0018 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -896,13 +896,9 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -// templated because if we have enough channels, we can write directly to the bf16 dbias buffer, and otherwise -// we need to write to a fp32 temp buffer. The `Atomic` argument indicates whether we add atomically. We cannot -// (easily) use a regular runtime `if(blockDim.y == 1)` runtime condition, because that doesn't compile for older -// GPUs. 
-template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, - std::bool_constant) { +template +__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -957,19 +953,37 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout v += __shfl_down_sync(0xffffffff, v, 2, 4); a += v; } - - // coalesced, but not cacheline-sized writes if(warp_d == 0 && global_oc < OC) { - // if we have only one block per result, no need for atomics - if constexpr (!Atomic) { + if constexpr (!UseAuxBuffer) { dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { - atomicAdd(dbias + global_oc + k, a); + dbias[global_oc + k + blockIdx.y * OC] = a; } } } } +__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { + const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; + assert(n % x128::size == 0); + if (idx < n) { + f128 acc; + for(int k = 0; k < f128::size; ++k) { + acc[k] = 0.f; + } + + for(int l = 0; l < m; ++l) { + f128 s = load128(src + idx + n * l); + for(int k = 0; k < f128::size; ++k) { + acc[k] += s[k]; + } + } + for(int k = 0; k < f128::size; ++k) { + dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); + } + } +} + __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, @@ -1308,12 +1322,6 @@ __global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { } } -__global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { - // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater -} - // ---------------------------------------------------------------------------- // kernel launchers @@ -1548,11 +1556,14 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. 
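// otherwise, each y-slice of the grid writes its partial sums into dbias_buffer and reduce_add_sum_kernel folds them into dbias in a fixed (deterministic) order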
if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); } else { - cudaMemset(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // kernel 9 overwrites temp buffer, so no need to memset + matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); + cudaCheck(cudaGetLastError()); } } From d48c3a494542d496601c2ca683a1802097f75ec7 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 12:39:28 +0300 Subject: [PATCH 083/172] (optionally) recompute gelu activations to reduce activation memory --- train_gpt2.cu | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..608034add 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1766,7 +1766,7 @@ typedef struct { floatX* output; } ActivationTensors; -void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config) { +void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, bool recompute) { size_t Vp = config.padded_vocab_size; size_t L = config.num_layers; size_t NH = config.num_heads; @@ -1788,7 +1788,14 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[9] = L * B * T; // ln2_mean act_sizes[10] = L * B * T; // ln2_rstd act_sizes[11] = L * B * T * 4*C; // fch - act_sizes[12] = L * B * T * 4*C; // fch_gelu + // fch_gelu; result of a pointwise op, we may want to recompute to save activation memory + if (recompute) { + // if we recompute gelus, we just use the scratch buffer here + act_sizes[12] = B * T * 4*C; + } else { + act_sizes[12] = L * B * T * 4*C; + } + act_sizes[13] = L * B * T * C; // fcproj act_sizes[14] = L * B * T * C; // residual3 act_sizes[15] = B * T * C; // lnf @@ -1897,6 +1904,7 @@ typedef struct { floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; + int recompute_activations; } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -1977,6 +1985,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->mean_loss = -1.0f; // -1.0f will designate no loss model->rng_state = 13371337; model->use_master_weights = 1; // keep master weights copy in float for optim update? 
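+ // if nonzero, gelu activations are recomputed during the backward pass instead of keeping a per-layer fch_gelu buffer (see fill_in_activation_sizes)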
+ model->recompute_activations = 0; } void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { @@ -2012,7 +2021,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in model->batch_size = B; model->seq_len = T; // allocate the space - fill_in_activation_sizes(model->act_sizes, B, T, model->config); + fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute_activations); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -2075,7 +2084,12 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + floatX* l_fch_gelu; + if(model->recompute_activations) { + l_fch_gelu = acts.fch_gelu; // reuse the same buffer for every layer + } else { + l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + } floatX* l_fcproj = acts.fcproj + l * B * T * C; floatX* l_residual3 = acts.residual3 + l * B * T * C; @@ -2249,6 +2263,10 @@ void gpt2_backward(GPT2 *model) { floatX* dl_bt4c = (floatX*)grads_acts.bt4c; // backprop this layer + if(model->recompute_activations) { + l_fch_gelu = acts.fch_gelu; + gelu_forward(l_fch_gelu, l_fch, B*T*4*C); + } matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C); gelu_backward(dl_bt4c, l_fch, dl_bt4c, B*T*4*C); matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C); @@ -2588,6 +2606,7 @@ void error_usage() { fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); + fprintf(stderr, " -r Recompute some activations to save memory\n"); exit(EXIT_FAILURE); } @@ -2612,6 +2631,7 @@ int main(int argc, char *argv[]) { int max_steps = -1; int override_enable_tf32 = 1; int use_master_weights = 1; + int recompute_activations = 0; int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag @@ -2634,6 +2654,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } + else if (argv[i][1] == 'r') { recompute_activations = atoi(argv[i+1]); } else { error_usage(); } } // calculate a sensible default for total batch size by assuming no gradient accumulation @@ -2654,6 +2675,7 @@ int main(int argc, char *argv[]) { printf0("| genT | %-50d |\n", genT); printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); + printf0("| recompute_activations | %-50s |\n", recompute_activations ? 
"enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); common_start(override_enable_tf32, false); // common init code for train/test/profile @@ -2670,6 +2692,7 @@ int main(int argc, char *argv[]) { GPT2 model; gpt2_build_from_checkpoint(&model, load_filename); model.use_master_weights = use_master_weights; + model.recompute_activations = recompute_activations; printf0("| load_filename | %-50s |\n", load_filename); printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); From 57f70ea66b4dc2859c9dea7ce677f68297d252e4 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 11:17:55 +0300 Subject: [PATCH 084/172] simplify multi-gpu logic by reducing #ifdefs --- test_gpt2.cu | 5 +++- train_gpt2.cu | 79 +++++++++++++-------------------------------------- 2 files changed, 24 insertions(+), 60 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 654e35db1..631357476 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,6 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { + multi_gpu_config = multi_gpu_config_init(&argc, &argv); common_start(false, true); // set the right paths @@ -119,6 +120,8 @@ int main(int argc, char *argv[]) { printf("batch_size: %d\n", B); printf("seq_len: %d\n", T); + set_zero_configs(&multi_gpu_config, 0, model.num_parameters); + // read reference information from the file saved from Python/PyTorch side // 1) input x and y int* x = (int*)mallocCheck(B * T * sizeof(int)); @@ -263,7 +266,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..a8fe6a995 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -458,28 +458,26 @@ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t t multi_gpu_config->shard_num_parameters = total_parameters; multi_gpu_config->shard_offset = 0; -#ifdef MULTI_GPU - // Check the Zero Stage and define sharding parameters - if (zero_stage == 0) { - printf0("| Zero Optimization is disabled |\n"); - } - else if (zero_stage == 1) { - if (total_parameters % multi_gpu_config->num_processes != 0) { - printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); - multi_gpu_config->zero_stage = 0; - } - else { - printf0("| Zero Stage1 is enabled |\n"); - multi_gpu_config->zero_stage = 1; - multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; - multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); - } - } - else{ - printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % multi_gpu_config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); multi_gpu_config->zero_stage = 0; } -#endif + else { + printf0("| 
Zero Stage1 is enabled |\n"); + multi_gpu_config->zero_stage = 1; + multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; + multi_gpu_config->shard_offset = multi_gpu_config->process_rank * multi_gpu_config->shard_num_parameters; + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + multi_gpu_config->zero_stage = 0; + } } // ---------------------------------------------------------------------------- @@ -2306,40 +2304,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { - NVTX_RANGE_FN(); - // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - - // lazily allocate the memory for m_memory and v_memory - if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); - if (model->use_master_weights == 1) { - // allocate one more buffer to keep the master copy of weights as float, and copy the weights over - cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); - cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); - } - } - - int block_size = 512; - int num_blocks = CEIL_DIV(model->num_parameters, block_size); - float beta1_correction = 1.0f - powf(beta1, t); - float beta2_correction = 1.0f - powf(beta2, t); - unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, - (floatX*)model->grads_memory, model->m_memory, model->v_memory, - model->num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); - cudaCheck(cudaGetLastError()); -} - -void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2828,13 +2793,9 @@ int main(int argc, char *argv[]) { // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced model.mean_loss = lossf; // update the parameters -#ifndef MULTI_GPU - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); -#else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, 
step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); -#endif // zero out the gradients for the next iteration gpt2_zero_grad(&model); cudaEventRecord(end); From 8b57cf65355c453d394231bfca478ef2d270bda5 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 14:12:29 +0300 Subject: [PATCH 085/172] reduce communication overhead for ZERO stage 1 --- train_gpt2.cu | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index a8fe6a995..04095e4d9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2295,12 +2295,20 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { if (multi_gpu_config->num_processes == 1) { return; } // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); - // Average all gradients. - ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, - model->num_parameters, - ncclFloatX, ncclAvg, - multi_gpu_config->nccl_comm, - 0)); + if(multi_gpu_config->zero_stage == 0) { + // no ZERO == standard DDP: Average all gradients. + ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, + model->num_parameters, + ncclFloatX, ncclAvg, + multi_gpu_config->nccl_comm, 0)); + } else if (multi_gpu_config->zero_stage == 1) { + // ZERO-1: Get average gradient for local shard + floatX* local_grads_memory = (floatX*) model->grads_memory + multi_gpu_config->shard_offset; + ncclCheck(ncclReduceScatter(model->grads_memory, local_grads_memory, + multi_gpu_config->shard_num_parameters, + ncclFloatX, ncclAvg, + multi_gpu_config->nccl_comm, 0)); + } #endif } From fbd8f03eead34791550b4aaf8add5e2775e0db75 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 14:25:06 +0300 Subject: [PATCH 086/172] fixup profiling --- profile_gpt2.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index c29cd6a08..4b24c8973 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -27,7 +27,8 @@ the profile.ncu-rep from a cloud box to local to pretty view. #define TESTING #include "train_gpt2.cu" -int main() { +int main(int argc, char *argv[]) { + multi_gpu_config = multi_gpu_config_init(&argc, &argv); common_start(true, true); // build the GPT-2 model from a checkpoint @@ -53,7 +54,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free From d7581fc5428b02d71b954ba9f9f073627c4d0a83 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 16 May 2024 19:07:39 +0000 Subject: [PATCH 087/172] make recompute be an int instead of bool, so we can strengthen it over time just like ZeRO stages, as we recompute more and more of the model in the future possibly. 
and make it default on because it is awesome --- train_gpt2.cu | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 608034add..a6dcb7d5b 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1766,7 +1766,7 @@ typedef struct { floatX* output; } ActivationTensors; -void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, bool recompute) { +void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, int recompute) { size_t Vp = config.padded_vocab_size; size_t L = config.num_layers; size_t NH = config.num_heads; @@ -1788,14 +1788,8 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[9] = L * B * T; // ln2_mean act_sizes[10] = L * B * T; // ln2_rstd act_sizes[11] = L * B * T * 4*C; // fch - // fch_gelu; result of a pointwise op, we may want to recompute to save activation memory - if (recompute) { - // if we recompute gelus, we just use the scratch buffer here - act_sizes[12] = B * T * 4*C; - } else { - act_sizes[12] = L * B * T * 4*C; - } - + // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer + act_sizes[12] = (recompute == 0) ? L * B * T * 4*C : B * T * 4*C; act_sizes[13] = L * B * T * C; // fcproj act_sizes[14] = L * B * T * C; // residual3 act_sizes[15] = B * T * C; // lnf @@ -1904,7 +1898,7 @@ typedef struct { floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; - int recompute_activations; + int recompute; } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -1985,7 +1979,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->mean_loss = -1.0f; // -1.0f will designate no loss model->rng_state = 13371337; model->use_master_weights = 1; // keep master weights copy in float for optim update? - model->recompute_activations = 0; + model->recompute = 1; // default to recompute gelu during backward } void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { @@ -2021,7 +2015,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in model->batch_size = B; model->seq_len = T; // allocate the space - fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute_activations); + fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -2084,12 +2078,9 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu; - if(model->recompute_activations) { - l_fch_gelu = acts.fch_gelu; // reuse the same buffer for every layer - } else { - l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; - } + // reuse the same activation buffer at each layer, as we'll re-compute the gelu during backward + // very useful because we dramatically reduce VRAM usage, and may be able to fit larger batch size + floatX* l_fch_gelu = (model->recompute == 0) ? 
acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; floatX* l_fcproj = acts.fcproj + l * B * T * C; floatX* l_residual3 = acts.residual3 + l * B * T * C; @@ -2252,7 +2243,7 @@ void gpt2_backward(GPT2 *model) { floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + floatX* l_fch_gelu = (model->recompute == 0) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; // get the pointers of the gradients of the activations for this layer // notice that there is no l *, because we just have a single copy, and keep // re-using this memory in every Transformer block as we calculate backward pass @@ -2262,9 +2253,10 @@ void gpt2_backward(GPT2 *model) { floatX* dl_btc = (floatX*)acts.lnf; floatX* dl_bt4c = (floatX*)grads_acts.bt4c; - // backprop this layer - if(model->recompute_activations) { - l_fch_gelu = acts.fch_gelu; + // start the backward pass for this layer + if(model->recompute >= 1) { + // recompute >= 1 means we recompute gelu. in this case, + // l_fch_gelu is just a buffer, so re-compute the gelu from l_fch here gelu_forward(l_fch_gelu, l_fch, B*T*4*C); } matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C); @@ -2606,7 +2598,7 @@ void error_usage() { fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); - fprintf(stderr, " -r Recompute some activations to save memory\n"); + fprintf(stderr, " -r recompute: saves memory at cost of speed. (default = 1), 0 = none. 1 = recompute gelu\n"); exit(EXIT_FAILURE); } @@ -2631,7 +2623,7 @@ int main(int argc, char *argv[]) { int max_steps = -1; int override_enable_tf32 = 1; int use_master_weights = 1; - int recompute_activations = 0; + int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag @@ -2654,7 +2646,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } - else if (argv[i][1] == 'r') { recompute_activations = atoi(argv[i+1]); } + else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } } // calculate a sensible default for total batch size by assuming no gradient accumulation @@ -2675,7 +2667,7 @@ int main(int argc, char *argv[]) { printf0("| genT | %-50d |\n", genT); printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); - printf0("| recompute_activations | %-50s |\n", recompute_activations ? 
"enabled" : "disabled"); + printf0("| recompute | %-50d |\n", recompute); printf0("+-----------------------+----------------------------------------------------+\n"); common_start(override_enable_tf32, false); // common init code for train/test/profile @@ -2692,7 +2684,7 @@ int main(int argc, char *argv[]) { GPT2 model; gpt2_build_from_checkpoint(&model, load_filename); model.use_master_weights = use_master_weights; - model.recompute_activations = recompute_activations; + model.recompute = recompute; printf0("| load_filename | %-50s |\n", load_filename); printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); From 3113656e3f1009264e1c2315947685d87e02f769 Mon Sep 17 00:00:00 2001 From: Anthony Blake Date: Thu, 16 May 2024 13:30:45 -0700 Subject: [PATCH 088/172] Add link to AMD fork --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 41cef25af..469326fbf 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,9 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## notable forks +- AMD support + - [llm.c](https://github.com/anthonix/llm.c) by @[anthonix](https://github.com/anthonix): support for AMD devices, such as the 7900 XTX + - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. Clone and run ✅ From 0f23723ae4f5916cfcc3367a59b2fa5b12a018e5 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 00:01:38 +0300 Subject: [PATCH 089/172] joined optimizer state allocation --- train_gpt2.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 18f08990b..d49d27d6f 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2340,12 +2340,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + size_t alloc_bytes = 2 * num_parameters * sizeof(float); + cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); + model->v_memory = model->m_memory + num_parameters; + cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); + printf0("allocated %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); if (model->use_master_weights == 1) { cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); From 88c3bea890bbee54901ae571b4a887b91546f504 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 00:57:09 +0300 Subject: [PATCH 090/172] print message before actual allocation for more informative OOM behaviour --- train_gpt2.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index d49d27d6f..91fe1ca72 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2030,8 +2030,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in num_activations += model->act_sizes[i]; } model->num_activations = num_activations; + printf0("allocating %d MiB for activations\n", (int)round(num_activations * sizeof(floatX) / (1024 * 1024))); model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes); - printf0("allocated %d MiB for activations\n", (int)round(num_activations * sizeof(floatX) / (1024 * 1024))); // also create memory for caching inputs and targets cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int))); cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int))); @@ -2167,19 +2167,19 @@ void gpt2_backward(GPT2 *model) { // lazily allocate the memory for gradients of the weights and activations, if needed if (model->grads_memory == NULL) { // allocate buffers for weight gradients + printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof); - printf0("allocated %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); // we're going to be clever for the activations backward pass. we don't need to exactly // mirror the forward pass activations and we will save memory. size_t bw_act_sizes[NUM_ACTIVATION_TENSORS]; fill_in_grad_act_sizes(bw_act_sizes, model->batch_size, model->seq_len, model->config); // count up and allocate the space - model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); model->num_grad_acts = 0; for (size_t i = 0; i < NUM_BACKWARD_TENSORS; i++) { model->num_grad_acts += bw_act_sizes[i]; } - printf0("allocated %d MiB for activation gradients\n", (int)round(model->num_grad_acts * sizeof(floatX) / (1024 * 1024))); + printf0("allocating %d MiB for activation gradients\n", (int)round(model->num_grad_acts * sizeof(floatX) / (1024 * 1024))); + model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); // init gradients of parameters and activations to zero gpt2_zero_grad(model); } @@ -2341,15 +2341,15 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo if (model->m_memory == NULL) { size_t alloc_bytes = 2 * num_parameters * sizeof(float); + printf0("allocating %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); model->v_memory = model->m_memory + num_parameters; cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); - printf0("allocated %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); if (model->use_master_weights == 1) { + printf0("allocating %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } } From b24279c4db2beb9bb962cdd7b87782db5df7dd32 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 01:15:47 +0300 Subject: [PATCH 091/172] remove duplicate workspace allocation --- train_gpt2.cu | 2 -- 1 file changed, 2 
deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 91fe1ca72..afc1f7ba0 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2409,8 +2409,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - // setup the (global) cuBLASLt workspace - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); create_cudnn(); } From c8fa7a8c63a110f0b74746210c5dfc18717758e7 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 17 May 2024 15:13:36 +0000 Subject: [PATCH 092/172] revert the adamw allocation to previous. minor --- train_gpt2.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index afc1f7ba0..030c5c9b7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2340,11 +2340,12 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; if (model->m_memory == NULL) { - size_t alloc_bytes = 2 * num_parameters * sizeof(float); - printf0("allocating %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); - cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); - model->v_memory = model->m_memory + num_parameters; - cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); + printf0("allocating %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocating %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); if (model->use_master_weights == 1) { printf0("allocating %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); From bf36a4b6b5be081bb675dca0224a61fcba542769 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 18 May 2024 13:27:35 +0300 Subject: [PATCH 093/172] improved numerical error checking: tighter tolarances relative tolerance based of bf16 epsilon less verbose output if all is OK --- test_gpt2.cu | 60 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 631357476..862a641e3 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -8,11 +8,15 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 int ok = 1; float max_diff = 0.0f; float max_rel_error = 0.0f; + float max_to_threshold = 0.f; float max_a = 0.0f; float max_b = 0.0f; - printf("%s\n", label); + float epsilon = 0.079; // BF16 epsilon value + printf("%8s: ", label); for (int i = 0; i < n; i++) { + float t_eff = threshold + fabs(b[i]) * epsilon; float diff = fabsf(a[i] - b[i]); + max_to_threshold = max(max_to_threshold, diff / t_eff); if (diff > max_diff) { max_diff = diff; float denom = fabsf(b[i]); @@ -20,21 +24,27 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 max_a = 
a[i]; max_b = b[i]; } - if (diff <= threshold) { - if (i < print_upto) { printf("OK "); } - } else { - if (i < print_upto) { printf("NOT OK "); } + if (diff > t_eff) { ok = 0; } - if (i < print_upto) { printf("%f %f\n", a[i], b[i]); } } // print the final result if (ok) { - printf("TENSOR OK, max diff: %e, with rel error: %e (calculated=%f, ref=%f)\n", - max_diff, max_rel_error, max_a, max_b); + printf("TENSOR OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", + max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); } else { - printf("TENSOR NOT OK, max diff: %e, with rel error: %e (calculated=%f, ref=%f)\n", - max_diff, max_rel_error, max_a, max_b); + printf("TENSOR NOT OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", + max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); + } + + if(ok == 0) { + for (int i = 0; i < print_upto; i++) { + float t_eff = threshold + fabs(b[i]) * epsilon; + float diff = fabsf(a[i] - b[i]); + printf(diff <= threshold ? "OK " : "NOT OK "); + printf("%f %f\n", a[i], b[i]); + } + printf("\n"); } return ok; } @@ -248,22 +258,24 @@ int main(int argc, char *argv[]) { // Also, if code changes and some of these get tripped, it could be ok if it's not by too much, // because our use of stochastic rounding is adding some non-determinism "pepper noise". // In that case it's ok to extend the tolerance by a bit, after a manual review. - allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 8e-1f); - allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 1e-2f); - allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.4e-1); // hmm a bit high - allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 4e-2f); - allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 3e-2f); + // Also, different GPUs may use different matrix multiplication algorithms, so the + // actual errors can be hardware specific. 
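For reference, the per-element test above combines an absolute floor with a relative term scaled by the reference magnitude. A stand-alone restatement of just that check, as an editor's sketch rather than code from the patch (the function and parameter names are illustrative):

#include <math.h>

// Returns 1 if `a` matches the reference `b` within threshold + |b| * epsilon,
// where epsilon plays the role of the BF16-derived relative factor in check_tensor.
int within_tolerance(float a, float b, float threshold, float epsilon) {
    float t_eff = threshold + fabsf(b) * epsilon;  // effective per-element tolerance
    return fabsf(a - b) <= t_eff;                  // 1 = OK, 0 = NOT OK
}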
+ allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 4e-1f); // hmm a bit high + allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 4e-3f); + allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1e-1); // hmm a bit high + allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 3.5e-2f); + allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 2e-2f); allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", 3e-2f); - allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", 3e-2f); - allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", 0.1f); // hmm bit higher - allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", 3e-2f); - allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", 0.1f); // hmm bit higher - allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", 3e-2f); + allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", 1.5e-2f); + allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", 6e-4f); + allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", 9e-3f); + allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", 2e-3f); + allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", 2.5e-3f); allok = allok & check_tensor(tensors1[14], tensors2[14], C, "lnfw", 0.12f); // hmm bit higher - allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); + allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 2e-2f); } gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); From 4374360015c93143acc261d96e951f4be8e41330 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 18 May 2024 18:34:24 +0000 Subject: [PATCH 094/172] adjust wte upper bound a bit, and print always because this part is really tricky and i don't trust anything other than manual inspection, even if we pass, allegedly --- test_gpt2.cu | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 862a641e3..84701b039 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -12,7 +12,8 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 float max_a = 0.0f; float max_b = 0.0f; float epsilon = 0.079; // BF16 epsilon value - printf("%8s: ", label); + printf("---\n"); + printf("checking tensor: %s\n", label); for (int i = 0; i < n; i++) { float t_eff = threshold + fabs(b[i]) * epsilon; float diff = fabsf(a[i] - b[i]); @@ -27,6 +28,11 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 if (diff > t_eff) { ok = 0; } + // print the first few elements so we can visually assess the "proof" of the comparison + if (i < print_upto) { + printf(diff <= t_eff ? 
"OK " : "NOT OK "); + printf("%f %f\n", a[i], b[i]); + } } // print the final result if (ok) { @@ -36,16 +42,6 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 printf("TENSOR NOT OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); } - - if(ok == 0) { - for (int i = 0; i < print_upto; i++) { - float t_eff = threshold + fabs(b[i]) * epsilon; - float diff = fabsf(a[i] - b[i]); - printf(diff <= threshold ? "OK " : "NOT OK "); - printf("%f %f\n", a[i], b[i]); - } - printf("\n"); - } return ok; } @@ -260,7 +256,7 @@ int main(int argc, char *argv[]) { // In that case it's ok to extend the tolerance by a bit, after a manual review. // Also, different GPUs may use different matrix multiplication algorithms, so the // actual errors can be hardware specific. - allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 4e-1f); // hmm a bit high + allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 6e-1f); // hmm a bit high allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 4e-3f); allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1e-1); // hmm a bit high allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 3.5e-2f); From 44d45bdd6a2d6ded079ea1fe762b61bb0889faba Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 1 May 2024 04:24:54 +0300 Subject: [PATCH 095/172] first draft for gradient clipping by global norm --- profile_gpt2cu.py | 2 ++ train_gpt2.cu | 83 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index d9dbd4f8e..4113d7819 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -50,6 +50,8 @@ # model config CLS_START = -1 CLS_NUM = 6 +NORM_ID = 44 +ADAM_ID = 45 N_LAYERS = 12 summaries = defaultdict(lambda: 0.0) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..cfec532c1 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,11 +1151,31 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float* grad_norm, float max_grad_norm, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard + + float scale = 1.f; + if(!isfinite(*grad_norm)) { + // if we had a numerical problem (e.g, overflow) + // in our gradient calculation, don't mess up the + // existing weights. 
+ // TODO increase a global counter somewhere so we actually know if/how often this happens + if(threadIdx.x == 0 && blockIdx.x == 0) { + printf("[WARNING] weight update skipped due to non-finite gradients!\n"); + } + return; + } + if(*grad_norm > max_grad_norm) { + scale = max_grad_norm / *grad_norm; + // TODO just for debugging, remove this + if(threadIdx.x == 0 && blockIdx.x == 0) { + printf("[scale %f]\n", scale); + } + } // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[idx]; + float grad = scale * (float)grads_memory[idx]; float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -1180,6 +1200,40 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg if (master_params_memory != NULL) { master_params_memory[idx] = param; } } +template +__global__ void norm_kernel(float* out, const T* data, size_t count) { + // we want as few atomics as possible, so each block tries to do + // the maximum amount of work (so no fixed chunk, but instead iterating + // until we run out of data), and then we reduce inside the block + // and finally have just one atomic per block. + // TODO write a second version that just spams atomics in dev/cuda, + // often they are surprisingly fast + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + __shared__ float block_result[32]; + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + block_result[warp.meta_group_rank()] = warp_result; + block.sync(); + if(warp.meta_group_rank() == 0) { + float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; + float block_sum = cg::reduce(warp, gather, cg::plus{}); + if(warp.thread_rank() == 0) { + atomicAdd(out, block_sum); + } + } +} + struct SoftmaxParams { float Scale; float Offset; @@ -1656,6 +1710,20 @@ void fused_classifier(Type* logits, Type* losses, cudaCheck(cudaGetLastError()); } +template +void norm(float* out, const T* values, size_t count) { + const int block_size = 512; + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. + const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + // ---------------------------------------------------------------------------- // GPT-2 model definition @@ -2354,14 +2422,25 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + // repurposing this buffer. We calculate the gradient norm on the GPU, and need it in the next kernel, + // so we _really_ don't want to transfer it here as an actual float. 
So we just pass around a pointer + // to this memory that is not otherwise needed during the update phase. + float* grad_norm = (float*)model->acts.output; + + // global gradient norm + norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); + float max_grad_norm = 1.f; // TODO figure out a good value unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_norm, max_grad_norm, + seed); cudaCheck(cudaGetLastError()); } From d7a81ef26fdd2b67f56d922612e9942927ee2ebd Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 1 May 2024 13:35:54 +0300 Subject: [PATCH 096/172] added a useful mixed precision utility for dev/cuda --- dev/cuda/Makefile | 3 +- dev/cuda/global_norm.cu | 199 ++++++++++++++++++++++++++++++++++++++++ train_gpt2.cu | 8 +- 3 files changed, 204 insertions(+), 6 deletions(-) create mode 100644 dev/cuda/global_norm.cu diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index c74178851..14eae201e 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -18,7 +18,7 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux- $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ # Build all targets -TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward +TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm all: $(TARGETS) # Individual targets: forward pass @@ -48,6 +48,7 @@ matmul_backward: matmul_backward.cu # Update kernels adamw: adamw.cu +global_norm: global_norm.cu # NCCL communication kernels nccl_all_reduce: nccl_all_reduce.cu diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu new file mode 100644 index 000000000..434c343f2 --- /dev/null +++ b/dev/cuda/global_norm.cu @@ -0,0 +1,199 @@ +/* +Kernels for a global norm. +Global norm in this context means that we want to calculate a single norm cooperatively using all avalailable SMs, instead + of multiple norms that can be handled by separate blocks. 
+ +Compile example: +nvcc -O3 --use_fast_math global_norm.cu -o global_norm + +version 1 uses as few blocks as possible to still fill the GPU, and only does atomic adds in the end +./gelu_forward 1 + +version 2 is the same but with only warp-wide reduction inside the kernel, and more global atomics +./gelu_forward 2 +*/ + +#include "common.h" +#include +#include +#include + +// TODO move this into common.h +// turn on bf16 as default, done up here for now +#define ENABLE_BF16 + +#if defined(ENABLE_BF16) +typedef __nv_bfloat16 floatX; +typedef __nv_bfloat16 floatN; +#elif defined(ENABLE_FP16) +typedef half floatX; +typedef half floatN; +#else +typedef float floatX; +typedef float floatN; +#endif + +typedef Packed128 x128; + +float global_norm_cpu(const float* data, size_t count) { + // accumulate in double so we have an accurate numerical reference + double acc = 0.0; + for(size_t i = 0; i < count; ++i) { + acc += (double)data[i] * (double)data[i]; + } + return (float)acc; +} + + +template +__global__ void norm_kernel1(float* out, const T* data, size_t count) { + // we want as few atomics as possible, so each block tries to do + // the maximum amount of work (so no fixed chunk, but instead iterating + // until we run out of data), and then we reduce inside the block + // and finally have just one atomic per block. + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + __shared__ float block_result[32]; + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + block_result[warp.meta_group_rank()] = warp_result; + block.sync(); + if(warp.meta_group_rank() == 0) { + float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; + float block_sum = cg::reduce(warp, gather, cg::plus{}); + if(warp.thread_rank() == 0) { + atomicAdd(out, block_sum); + } + } +} + + + +template +__global__ void norm_kernel2(float* out, const T* data, size_t count) { + // no shared memory; but one atomic per warp instead of per block + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + // and atomic in global buffer + if(warp.thread_rank() == 0) { + atomicAdd(out, warp_result); + } +} + + + +template +void global_norm1(float* out, const T* values, size_t count, int block_size) { + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. 
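// [Editor's aside, not part of the original file] A concrete instance of the sizing
// described above: on an A100-class GPU (108 SMs, up to 2048 resident threads per SM),
// block_size = 512 gives grid_size = 2048 * 108 / 512 = 432 blocks, i.e. exactly one
// full wave of the GPU; one extra block would have to run as a second, nearly idle wave.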
+ const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel1<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + +template +void global_norm2(float* out, const T* values, size_t count, int block_size) { + // ditto + const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel2<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + +void global_norm(int kernel_num, float* out, const floatX* values, size_t count, int block_size) { + switch (kernel_num) { + case 1: + return global_norm1(out, values, count, block_size); + case 2: + return global_norm2(out, values, count, block_size); + } +} + +int main(int argc, const char **argv) { + setup_main(); + + int C = 768; + int L = 12; + + size_t num_params = (size_t)(C * 4*C + C*C) * 2 * L; + + // create host memory of random numbers + float* inp = make_random_float(num_params); + // scale them down + for(size_t i = 0; i < num_params; ++i) { + inp[i] *= 1e-3; + } + + // read kernel_num from command line + int kernel_num = 1; + if (argc > 1) { + kernel_num = atoi(argv[1]); + } + printf("Using kernel %d\n", kernel_num); + + // first check the correctness of the kernel + float out = global_norm_cpu(inp, num_params); + + // move to GPU + float* d_out; + floatX* d_inp; + cudaCheck(cudaMalloc(&d_out, sizeof(float))); + cudaCheck(cudaMalloc(&d_inp, num_params * sizeof(floatX))); + cudaCheck(memcpy_convert(d_inp, inp, num_params)); + + int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + cudaCheck(cudaMemset(d_out, 0, sizeof(float))); + global_norm(kernel_num, d_out, d_inp, num_params, block_size); + validate_result(d_out, &out, "out", 1, 1e-2f); + } + + printf("All results match. Starting benchmarks.\n\n"); + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + + int repeat_times = 1000; + + float elapsed_time = benchmark_kernel(repeat_times, global_norm, + kernel_num, d_out, d_inp, + num_params, block_size); + size_t memory_ops = num_params * sizeof(floatX); + float memory_bandwidth = memory_ops / elapsed_time / 1e6; + + printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s\n", block_size, elapsed_time, memory_bandwidth); + } + + // free memory + free(inp); + cudaCheck(cudaFree(d_out)); + cudaCheck(cudaFree(d_inp)); +} \ No newline at end of file diff --git a/train_gpt2.cu b/train_gpt2.cu index cfec532c1..cb006285c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1206,8 +1206,6 @@ __global__ void norm_kernel(float* out, const T* data, size_t count) { // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block // and finally have just one atomic per block. 
- // TODO write a second version that just spams atomics in dev/cuda, - // often they are surprisingly fast namespace cg = cooperative_groups; cg::thread_block block = cg::this_thread_block(); cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); @@ -1711,7 +1709,7 @@ void fused_classifier(Type* logits, Type* losses, } template -void norm(float* out, const T* values, size_t count) { +void global_norm(float* out, const T* values, size_t count) { const int block_size = 512; // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. // having one block less than possible is a tiny performance hit, having @@ -1720,7 +1718,7 @@ void norm(float* out, const T* values, size_t count) { // on all gpus, so the division really is going to be exact. const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - norm_kernel<<>>(out, values, count); + norm_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } @@ -2428,7 +2426,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo float* grad_norm = (float*)model->acts.output; // global gradient norm - norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + global_norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); From a9947a8315101778de657b2e597c1a2ffc09939e Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 3 May 2024 00:39:06 +0300 Subject: [PATCH 097/172] added a flag and the missing sqrt; testing now has clipping enabled --- test_gpt2.cu | 22 +++++++++++----------- train_gpt2.cu | 14 ++++++++------ train_gpt2.py | 6 ++++-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 84701b039..50a291f18 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -274,7 +274,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 2e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, 1.f, step+1, &multi_gpu_config); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); @@ -283,16 +283,16 @@ int main(int argc, char *argv[]) { // expected losses are as follows, from Python float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.2700, + 4.0607, + 3.3166, + 2.7115, + 2.1702, + 1.6349, + 1.1419, + 0.7038, + 0.3769, + 0.1743 }; // compare diff --git a/train_gpt2.cu b/train_gpt2.cu index cb006285c..088afd69c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1168,10 +1168,10 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg return; } if(*grad_norm > max_grad_norm) { - scale = max_grad_norm / *grad_norm; + scale = max_grad_norm / sqrtf(*grad_norm); // TODO just for debugging, remove this if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[scale %f]\n", scale); + printf("[norm %f]\n", sqrtf(*grad_norm)); } } // get the gradient, m, and v for this parameter @@ -2399,7 +2399,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float 
learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clipping, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2432,12 +2432,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); - float max_grad_norm = 1.f; // TODO figure out a good value unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, max_grad_norm, + grad_norm, grad_clipping, seed); cudaCheck(cudaGetLastError()); } @@ -2684,6 +2683,7 @@ int main(int argc, char *argv[]) { int use_master_weights = 1; int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training + float grad_clipping = 1.f; for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2704,6 +2704,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } + else if (argv[i][1] == 'c') { grad_clipping = atof(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } @@ -2719,6 +2720,7 @@ int main(int argc, char *argv[]) { printf0("| sequence length T | %-50d |\n", T); printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); + printf0("| grad_clipping | %-50e |\n", grad_clipping); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); printf0("| val_max_batches | %-50d |\n", val_max_batches); @@ -2903,7 +2905,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clipping, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); diff --git a/train_gpt2.py b/train_gpt2.py index c50fc7b61..b57fe432c 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -407,6 +407,7 @@ def print0(*args, **kwargs): parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") + parser.add_argument("--grad_clipping", type=float, default=1, help="maximum gradient magnitude") args = 
parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -552,6 +553,7 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] + norm = -1 # dummy value to print in inference-only mode for step in range(args.num_iterations): t0 = time.time() @@ -575,7 +577,7 @@ def get_batch(): # backward pass if not args.inference_only: loss.backward() - # todo: grad clip here + norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clipping) optimizer.step() optimizer.zero_grad(set_to_none=True) @@ -588,7 +590,7 @@ def get_batch(): t1 = time.time() # the 0th iteration is often an outlier (much slower) => skip logging it tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0) - print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") + print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}, norm: {norm:.3f}") if step > 0 and step > args.num_iterations - 20: timings.append(t1-t0) From c3a3b9daa526eb5cc02b05aaea7451ed3c4f0d53 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 3 May 2024 01:08:04 +0300 Subject: [PATCH 098/172] fixed profile target --- profile_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 4b24c8973..5a6764533 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, &multi_gpu_config); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free From 589ead1e4ffd65db4f2ef2d5a8ea306bd06cd9e9 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Wed, 8 May 2024 00:13:09 +0200 Subject: [PATCH 099/172] updated code to adapt to latest changes --- profile_gpt2cu.py | 4 +--- train_gpt2.cu | 27 ++++++++------------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 4113d7819..de2edfda9 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -50,8 +50,6 @@ # model config CLS_START = -1 CLS_NUM = 6 -NORM_ID = 44 -ADAM_ID = 45 N_LAYERS = 12 summaries = defaultdict(lambda: 0.0) @@ -132,7 +130,7 @@ # the classifier part, counts only once pass_name = "cls" phase = "bwd" - elif "adamw" in kernel: + elif "adamw" in kernel or "global_norm" in kernel: # encoder layer or adam pass_name = "opt" # before the first optimizer run, we create weight copies. diff --git a/train_gpt2.cu b/train_gpt2.cu index 088afd69c..05a0ba2fc 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1201,18 +1201,13 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg } template -__global__ void norm_kernel(float* out, const T* data, size_t count) { +__global__ void global_norm_kernel(float* out, const T* data, size_t count) { // we want as few atomics as possible, so each block tries to do // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block // and finally have just one atomic per block. 
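Taken together, patches 095 through 099 implement gradient clipping by global norm: the kernel above accumulates the sum of squared gradient elements into a single device float, and adamw_kernel3 converts it into a scale factor applied to each gradient as it is read. A plain CPU sketch of the underlying operation, as an editor's example rather than code from the patches (function and variable names are illustrative; in the patches the squared norm is what gets passed around, and the scale is applied inside the optimizer kernel instead of rewriting the gradient buffer):

#include <math.h>
#include <stddef.h>

// Clip `grads` so that its global L2 norm does not exceed max_norm (1.0f by default via -c).
void clip_grads_by_global_norm(float* grads, size_t n, float max_norm) {
    double sumsq = 0.0;                 // what the global_norm kernel accumulates (as a float, via atomics)
    for (size_t i = 0; i < n; i++) {
        sumsq += (double)grads[i] * (double)grads[i];
    }
    float norm = sqrtf((float)sumsq);   // the sqrtf that patch 097 adds on the kernel side
    if (norm > max_norm) {
        float scale = max_norm / norm;  // corresponds to `scale` in adamw_kernel3
        for (size_t i = 0; i < n; i++) {
            grads[i] *= scale;
        }
    }
}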
- namespace cg = cooperative_groups; - cg::thread_block block = cg::this_thread_block(); - cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); - - __shared__ float block_result[32]; - - // out will be updated atomically from all thread blocks + // out will be updated atomically from all thread blocks. It is a float, so the + // atomic op is unproblematic size_t index = threadIdx.x + blockDim.x * blockIdx.x; size_t grid_width = blockDim.x * gridDim.x; float accumulator = 0.f; @@ -1220,15 +1215,9 @@ __global__ void norm_kernel(float* out, const T* data, size_t count) { accumulator += (float)data[i] * (float)data[i]; } // warp-level reduce - float warp_result = cg::reduce(warp, accumulator, cg::plus{}); - block_result[warp.meta_group_rank()] = warp_result; - block.sync(); - if(warp.meta_group_rank() == 0) { - float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; - float block_sum = cg::reduce(warp, gather, cg::plus{}); - if(warp.thread_rank() == 0) { - atomicAdd(out, block_sum); - } + float block_sum = blockReduce(accumulator); + if(threadIdx.x == 0) { + atomicAdd(out, block_sum); } } @@ -1716,9 +1705,9 @@ void global_norm(float* out, const T* values, size_t count) { // one block too many is catastrophic, since it only can start once all the other // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 // on all gpus, so the division really is going to be exact. - const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - norm_kernel<<>>(out, values, count); + global_norm_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } From 66ce5766e004f9eec34363f4eb8480b3a305ebc6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 18 May 2024 23:06:26 +0300 Subject: [PATCH 100/172] fixed up dev/cuda --- dev/cuda/global_norm.cu | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu index 434c343f2..2295c4976 100644 --- a/dev/cuda/global_norm.cu +++ b/dev/cuda/global_norm.cu @@ -5,35 +5,17 @@ Global norm in this context means that we want to calculate a single norm cooper Compile example: nvcc -O3 --use_fast_math global_norm.cu -o global_norm - -version 1 uses as few blocks as possible to still fill the GPU, and only does atomic adds in the end -./gelu_forward 1 - -version 2 is the same but with only warp-wide reduction inside the kernel, and more global atomics -./gelu_forward 2 */ -#include "common.h" + #include #include #include -// TODO move this into common.h // turn on bf16 as default, done up here for now #define ENABLE_BF16 +#include "common.h" -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; float global_norm_cpu(const float* data, size_t count) { // accumulate in double so we have an accurate numerical reference @@ -167,7 +149,7 @@ int main(int argc, const char **argv) { cudaCheck(cudaMalloc(&d_inp, num_params * sizeof(floatX))); cudaCheck(memcpy_convert(d_inp, inp, num_params)); - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; for (int j = 0; j < 
sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; printf("Checking block size %d.\n", block_size); From 77b991281ff9db57fa4f8615d2431e76d59ec3fd Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 11:51:01 +0000 Subject: [PATCH 101/172] add hellaswag reference eval. it looks pretty bad honestly, the examples themselves, but i'm told it is predictive... hmm --- hellaswag.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 hellaswag.py diff --git a/hellaswag.py b/hellaswag.py new file mode 100644 index 000000000..a0fea2afa --- /dev/null +++ b/hellaswag.py @@ -0,0 +1,160 @@ +""" +Downloads and evaluates HellaSwag in Python. +This then acts as the reference file for llm.c +https://github.com/rowanz/hellaswag + +Example HellaSwag json item: + +{"ind": 24, "activity_label": "Roof shingle removal", "ctx_a": "A man is sitting on a roof.", "ctx_b": "he", "ctx": "A man is sitting on a roof. he", "split": "val", "split_type": "indomain", "label": 3, "endings": ["is using wrap to wrap a pair of skis.", "is ripping level tiles off.", "is holding a rubik's cube.", "starts pulling up roofing on a roof."], "source_id": "activitynet~v_-JhWjGDPHMY"} + +ind: dataset ID +activity_label: The ActivityNet or WikiHow label for this example +context: There are two formats. The full context is in ctx. When the context ends in an (incomplete) noun phrase, like for ActivityNet, this incomplete noun phrase is in ctx_b, and the context up until then is in ctx_a. This can be useful for models such as BERT that need the last sentence to be complete. However, it's never required. If ctx_b is nonempty, then ctx is the same thing as ctx_a, followed by a space, then ctx_b. +endings: a list of 4 endings. The correct index is given by label (0,1,2, or 3) +split: train, val, or test. 
+split_type: indomain if the activity label is seen during training, else zeroshot +source_id: Which video or WikiHow article this example came from +""" + +import os +import json +import requests +import tiktoken +from tqdm import tqdm + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from transformers import GPT2LMHeadModel + +DATA_CACHE_DIR = "data" + +hellaswags = { + "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", + "val": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl", + "test": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl", +} + +enc = tiktoken.get_encoding("gpt2") + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + +def download(split): + """Downloads HellaSwag DATA_CACHE_DIR""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + data_url = hellaswags[split] + data_filename = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + else: + print(f"{data_filename} already exists, skipping download...") + +def render_example(example): + """ + Given the example as a dictionary, render it as three torch tensors: + - tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates) + - mask (is 1 in the region of the candidate completion, where we evaluate likelihoods) + - label (the index of the correct completion, which we hope has the highest likelihood) + """ + ctx = example["ctx"] + label = example["label"] + endings = example["endings"] + + # gather up all the tokens + ctx_tokens = enc.encode(ctx) + tok_rows = [] + mask_rows = [] + for end in endings: + end_tokens = enc.encode(" " + end) # note: prepending " " because GPT-2 tokenizer + tok_rows.append(ctx_tokens + end_tokens) + mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens)) + + # have to be careful during the collation because the number of tokens in each row can differ + max_len = max(len(row) for row in tok_rows) + tokens = torch.zeros((4, max_len), dtype=torch.long) + mask = torch.zeros((4, max_len), dtype=torch.long) + for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)): + tokens[i, :len(tok_row)] = torch.tensor(tok_row) + mask[i, :len(mask_row)] = torch.tensor(mask_row) + + return tokens, mask, label + +def iterate_examples(split): + download(split) + with open(os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl"), "r") as f: + for line in f: + example = json.loads(line) + rendered = render_example(example) + yield example, rendered + +@torch.no_grad() +def evaluate(model_type="gpt2-xl", device="cuda"): + + model = GPT2LMHeadModel.from_pretrained(model_type) + model.to(device) + + num_correct = 0 + num_total = 0 + data_it = iterate_examples("val") + for example, (tokens, mask, label) in data_it: + tokens = tokens.to(device) + mask = mask.to(device) + + # get the logits + logits = model(tokens).logits + # evaluate the autoregressive loss at all positions + shift_logits = (logits[..., :-1, 
:]).contiguous() + shift_tokens = (tokens[..., 1:]).contiguous() + flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + flat_shift_tokens = shift_tokens.view(-1) + shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') + shift_losses = shift_losses.view(tokens.size(0), -1) + # now get the average loss just for the completion region (where mask == 1), in each row + shift_mask = (mask[..., :-1]).contiguous() + masked_shift_losses = shift_losses * shift_mask + # sum and divide by the number of 1s in the mask + sum_loss = masked_shift_losses.sum(dim=1) + avg_loss = sum_loss / shift_mask.sum(dim=1) + # now we have a loss for each of the 4 completions + # the one with the lowest loss should be the most likely + # to think through more carefully: sum or average? sum is more right probabilistically + use_loss = sum_loss + # ok predict what the model thinks is the most likely completion + pred = use_loss.argmin().item() + + # accumulate stats + num_total += 1 + num_correct += int(pred == label) + print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + + # debug: pretty print a few examples, and the losses in each case + if num_total < 10: + print("---") + print(f"Context:\n {example['ctx']}") + print(f"Endings:") + for i, end in enumerate(example["endings"]): + print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"predicted: {pred}, actual: {label}") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-m", type=str, default="gpt2-xl", help="the model type to use") + parser.add_argument("-d", type=str, default="cuda", help="the device to use") + args = parser.parse_args() + evaluate(args.model_type, args.device) From aec9ce5d0a9f4b819a552b1438303506ed6f2dd8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 12:18:02 +0000 Subject: [PATCH 102/172] move hellaswag file to dev --- hellaswag.py => dev/hellaswag.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hellaswag.py => dev/hellaswag.py (100%) diff --git a/hellaswag.py b/dev/hellaswag.py similarity index 100% rename from hellaswag.py rename to dev/hellaswag.py From 7bd2389c1fe302b59f7ddc7dcacdb5f11e7765e2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 14:23:51 +0000 Subject: [PATCH 103/172] add mmlu as well and refine both a bit --- dev/hellaswag.py | 20 +++--- dev/mmlu.py | 156 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 8 deletions(-) create mode 100644 dev/mmlu.py diff --git a/dev/hellaswag.py b/dev/hellaswag.py index a0fea2afa..cdfe364cc 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -14,6 +14,9 @@ split: train, val, or test. 
split_type: indomain if the activity label is seen during training, else zeroshot source_id: Which video or WikiHow article this example came from + +gpt2 (124M) = 28.2% +gpt2-xl (1558M) = 39.22% """ import os @@ -28,7 +31,7 @@ from transformers import GPT2LMHeadModel -DATA_CACHE_DIR = "data" +DATA_CACHE_DIR = os.path.join("data", "hellaswag") hellaswags = { "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", @@ -95,23 +98,24 @@ def render_example(example): return tokens, mask, label def iterate_examples(split): + # there are 10,042 examples in total in val + download(split) with open(os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl"), "r") as f: for line in f: example = json.loads(line) - rendered = render_example(example) - yield example, rendered + yield example @torch.no_grad() -def evaluate(model_type="gpt2-xl", device="cuda"): +def evaluate(model_type, device): model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) num_correct = 0 num_total = 0 - data_it = iterate_examples("val") - for example, (tokens, mask, label) in data_it: + for example in iterate_examples("val"): + tokens, mask, label = render_example(example) tokens = tokens.to(device) mask = mask.to(device) @@ -154,7 +158,7 @@ def evaluate(model_type="gpt2-xl", device="cuda"): if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() - parser.add_argument("-m", type=str, default="gpt2-xl", help="the model type to use") - parser.add_argument("-d", type=str, default="cuda", help="the device to use") + parser.add_argument("-m", "--model_type", type=str, default="gpt2", help="the model type to use") + parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use") args = parser.parse_args() evaluate(args.model_type, args.device) diff --git a/dev/mmlu.py b/dev/mmlu.py new file mode 100644 index 000000000..f15b785b0 --- /dev/null +++ b/dev/mmlu.py @@ -0,0 +1,156 @@ +""" +Downloads and evaluates MMLU in Python. +This then acts as the reference file for llm.c +https://github.com/hendrycks/test + +gpt2 (124M) ~= 25% (chance) +gpt2-xl (1558M) = 27.00% ... +""" + +import os +import requests +import tiktoken +import pandas as pd +from tqdm import tqdm + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from transformers import GPT2LMHeadModel + +DATA_CACHE_DIR = os.path.join("data", "mmlu") + +enc = tiktoken.get_encoding("gpt2") +data_url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + +def download(): + """Downloads MMLU to DATA_CACHE_DIR""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + data_filename = os.path.join(DATA_CACHE_DIR, f"data.tar") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + os.system(f"tar -xf {data_filename} -C {DATA_CACHE_DIR}") # untar + # creates a directory "data" inside it, with e.g. 
data/test/*csv + else: + print(f"{data_filename} already exists, skipping download...") + +def iterate_examples(): + # there are 14,042 examples in total in the test set + + download() + test_dir = os.path.join(DATA_CACHE_DIR, "data", "test") + csv_files = [f for f in os.listdir(test_dir) if f.endswith(".csv")] + for csv_file in csv_files: + csv_path = os.path.join(test_dir, csv_file) + print(csv_path) + df = pd.read_csv(csv_path, header=None) + n = df.shape[0] + for idx in range(n): + example = { + "question": df.iloc[idx, 0], + "endings": [df.iloc[idx, 1], df.iloc[idx, 2], df.iloc[idx, 3], df.iloc[idx, 4]], + "label": df.iloc[idx, 5], + } + yield example + +def render_example(example): + """ + Given the example as a dictionary, render it as three torch tensors: + - tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates) + - mask (is 1 in the region of the candidate completion, where we evaluate likelihoods) + - label (the index of the correct completion, which we hope has the highest likelihood) + """ + ctx = f"Question: {example['question']}\n\nAnswer:" + ctx_tokens = enc.encode(ctx) + + tok_rows = [] + mask_rows = [] + for end in example["endings"]: + end_tokens = enc.encode(" " + str(end)) # note: prepending " " because GPT-2 tokenizer + tok_rows.append(ctx_tokens + end_tokens) + mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens)) + + # have to be careful during the collation because the number of tokens in each row can differ + max_len = max(len(row) for row in tok_rows) + tokens = torch.zeros((4, max_len), dtype=torch.long) + mask = torch.zeros((4, max_len), dtype=torch.long) + for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)): + tokens[i, :len(tok_row)] = torch.tensor(tok_row) + mask[i, :len(mask_row)] = torch.tensor(mask_row) + + label = "ABCD".index(example["label"]) + return tokens, mask, label + +@torch.no_grad() +def evaluate(model_type, device): + + model = GPT2LMHeadModel.from_pretrained(model_type) + model.to(device) + + num_correct = 0 + num_total = 0 + for example in iterate_examples(): + tokens, mask, label = render_example(example) + tokens = tokens.to(device) + mask = mask.to(device) + + # get the logits + logits = model(tokens).logits + # evaluate the autoregressive loss at all positions + shift_logits = (logits[..., :-1, :]).contiguous() + shift_tokens = (tokens[..., 1:]).contiguous() + flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + flat_shift_tokens = shift_tokens.view(-1) + shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') + shift_losses = shift_losses.view(tokens.size(0), -1) + # now get the average loss just for the completion region (where mask == 1), in each row + shift_mask = (mask[..., :-1]).contiguous() + masked_shift_losses = shift_losses * shift_mask + # sum and divide by the number of 1s in the mask + sum_loss = masked_shift_losses.sum(dim=1) + avg_loss = sum_loss / shift_mask.sum(dim=1) + # now we have a loss for each of the 4 completions + # the one with the lowest loss should be the most likely + # to think through more carefully: sum or average? 
sum is more right probabilistically + use_loss = sum_loss + # ok predict what the model thinks is the most likely completion + pred = use_loss.argmin().item() + + # accumulate stats + num_total += 1 + num_correct += int(pred == label) + print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + + # debug prints + if num_total < 10: + print("---") + print(f"Context:\n {example['question']}") + print(f"Endings:") + for i, end in enumerate(example["endings"]): + print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"predicted: {pred}, actual: {label}") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model_type", type=str, default="gpt2", help="the model type to use") + parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use") + args = parser.parse_args() + evaluate(args.model_type, args.device) From 4e14b5228528da147bab6e1ff71e76a8775dd73a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 15:18:04 +0000 Subject: [PATCH 104/172] report both acc and acc_norm --- dev/hellaswag.py | 24 ++++++++++++++++-------- dev/mmlu.py | 20 +++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dev/hellaswag.py b/dev/hellaswag.py index cdfe364cc..0a6d11b70 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -15,8 +15,13 @@ split_type: indomain if the activity label is seen during training, else zeroshot source_id: Which video or WikiHow article this example came from -gpt2 (124M) = 28.2% -gpt2-xl (1558M) = 39.22% +gpt2 (124M) +- eleuther harness reports acc 28.92%, acc_norm 31.14% (multiple choice style) +- this script: 10042 acc: 0.2820 acc_norm: 0.2839 + +gpt2-xl (1558M) +- eleuther harness reports acc 40.04%, acc_norm 50.89% (multiple choice style) +- this script: 10042 acc: 0.3922 acc_norm: 0.4664 """ import os @@ -109,9 +114,13 @@ def iterate_examples(split): @torch.no_grad() def evaluate(model_type, device): + torch.set_float32_matmul_precision('high') # use tf32 + model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) + # model = torch.compile(model) + num_correct_norm = 0 num_correct = 0 num_total = 0 for example in iterate_examples("val"): @@ -136,15 +145,14 @@ def evaluate(model_type, device): avg_loss = sum_loss / shift_mask.sum(dim=1) # now we have a loss for each of the 4 completions # the one with the lowest loss should be the most likely - # to think through more carefully: sum or average? 
sum is more right probabilistically - use_loss = sum_loss - # ok predict what the model thinks is the most likely completion - pred = use_loss.argmin().item() + pred = sum_loss.argmin().item() + pred_norm = avg_loss.argmin().item() # accumulate stats num_total += 1 num_correct += int(pred == label) - print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + num_correct_norm += int(pred_norm == label) + print(f"{num_total} acc: {num_correct/num_total:.4f} acc_norm: {num_correct_norm/num_total:.4f}") # debug: pretty print a few examples, and the losses in each case if num_total < 10: @@ -152,7 +160,7 @@ def evaluate(model_type, device): print(f"Context:\n {example['ctx']}") print(f"Endings:") for i, end in enumerate(example["endings"]): - print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"{i} (loss: {avg_loss[i].item():.4f}) {end}") print(f"predicted: {pred}, actual: {label}") if __name__ == "__main__": diff --git a/dev/mmlu.py b/dev/mmlu.py index f15b785b0..67520aeb9 100644 --- a/dev/mmlu.py +++ b/dev/mmlu.py @@ -3,8 +3,11 @@ This then acts as the reference file for llm.c https://github.com/hendrycks/test -gpt2 (124M) ~= 25% (chance) +gpt2 (124M) +- this script: 14042 acc: 0.2534 acc_norm: 0.2734 + gpt2-xl (1558M) = 27.00% ... +- this script: 14042 acc: 0.2700 acc_norm: 0.2938 """ import os @@ -101,9 +104,13 @@ def render_example(example): @torch.no_grad() def evaluate(model_type, device): + torch.set_float32_matmul_precision('high') # use tf32 + model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) + # model = torch.compile(model) + num_correct_norm = 0 num_correct = 0 num_total = 0 for example in iterate_examples(): @@ -128,15 +135,14 @@ def evaluate(model_type, device): avg_loss = sum_loss / shift_mask.sum(dim=1) # now we have a loss for each of the 4 completions # the one with the lowest loss should be the most likely - # to think through more carefully: sum or average? 
sum is more right probabilistically - use_loss = sum_loss - # ok predict what the model thinks is the most likely completion - pred = use_loss.argmin().item() + pred = sum_loss.argmin().item() + pred_norm = avg_loss.argmin().item() # accumulate stats num_total += 1 num_correct += int(pred == label) - print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + num_correct_norm += int(pred_norm == label) + print(f"{num_total} acc: {num_correct/num_total:.4f} acc_norm: {num_correct_norm/num_total:.4f}") # debug prints if num_total < 10: @@ -144,7 +150,7 @@ def evaluate(model_type, device): print(f"Context:\n {example['question']}") print(f"Endings:") for i, end in enumerate(example["endings"]): - print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"{i} (loss: {avg_loss[i].item():.4f}) {end}") print(f"predicted: {pred}, actual: {label}") if __name__ == "__main__": From 8d55c4a6acc3bcf94b7ddb47b9aeaaeea520e3f4 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 15:45:28 +0000 Subject: [PATCH 105/172] fix a bug, we have to be careful to make sure we evaluate loss at the token just before the first completion token, because that is the prediction for the first token, and its accuracy --- dev/hellaswag.py | 6 +++--- dev/mmlu.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dev/hellaswag.py b/dev/hellaswag.py index 0a6d11b70..361a7757a 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -17,11 +17,11 @@ gpt2 (124M) - eleuther harness reports acc 28.92%, acc_norm 31.14% (multiple choice style) -- this script: 10042 acc: 0.2820 acc_norm: 0.2839 +- this script: 10042 acc: 0.2859 acc_norm: 0.2955 (completion style) gpt2-xl (1558M) - eleuther harness reports acc 40.04%, acc_norm 50.89% (multiple choice style) -- this script: 10042 acc: 0.3922 acc_norm: 0.4664 +- this script: 10042 acc: 0.3842 acc_norm: 0.4893 (completion style) """ import os @@ -138,7 +138,7 @@ def evaluate(model_type, device): shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') shift_losses = shift_losses.view(tokens.size(0), -1) # now get the average loss just for the completion region (where mask == 1), in each row - shift_mask = (mask[..., :-1]).contiguous() + shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token masked_shift_losses = shift_losses * shift_mask # sum and divide by the number of 1s in the mask sum_loss = masked_shift_losses.sum(dim=1) diff --git a/dev/mmlu.py b/dev/mmlu.py index 67520aeb9..b61fe9324 100644 --- a/dev/mmlu.py +++ b/dev/mmlu.py @@ -4,10 +4,10 @@ https://github.com/hendrycks/test gpt2 (124M) -- this script: 14042 acc: 0.2534 acc_norm: 0.2734 +- this script: 14042 acc: 0.2557 acc_norm: 0.2721 -gpt2-xl (1558M) = 27.00% ... 
-- this script: 14042 acc: 0.2700 acc_norm: 0.2938 +gpt2-xl (1558M) +- this script: 14042 acc: 0.2927 acc_norm: 0.3035 """ import os @@ -128,7 +128,7 @@ def evaluate(model_type, device): shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') shift_losses = shift_losses.view(tokens.size(0), -1) # now get the average loss just for the completion region (where mask == 1), in each row - shift_mask = (mask[..., :-1]).contiguous() + shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token masked_shift_losses = shift_losses * shift_mask # sum and divide by the number of 1s in the mask sum_loss = masked_shift_losses.sum(dim=1) From 9e645314e65fc44c8d15c81258bddb620cf5f3ca Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Mon, 20 May 2024 00:00:21 +0800 Subject: [PATCH 106/172] Check if file exists using platform specific commands & Add *.o to gitignore Signed-off-by: Jun Zhang --- .gitignore | 1 + Makefile | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5e88e4285..f60885e23 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce *.obj *.exe +*.o # log files *.log diff --git a/Makefile b/Makefile index 46abdc9a5..c8b555ac2 100644 --- a/Makefile +++ b/Makefile @@ -23,9 +23,15 @@ NVCC_CUDNN = USE_CUDNN ?= 0 # Function to check if a file exists in the PATH +ifneq ($(OS), Windows_NT) define file_exists_in_path - $(shell where $(1) 2>nul || which $(1) 2>/dev/null) + $(which $(1) 2>/dev/null) endef +else +define file_exists_in_path + $(shell where $(1) 2>nul) +endef +endif ifneq ($(CI),true) # if not in CI, then use the GPU query ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= From c2d12f725ebd379eabbf8d4533bee51c20faa3fc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 17:07:55 +0000 Subject: [PATCH 107/172] small touchups to grad clip --- dev/cuda/global_norm.cu | 11 +++++++---- train_gpt2.cu | 33 ++++++++++++++------------------- train_gpt2.py | 6 +++--- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu index 2295c4976..6c2ed0389 100644 --- a/dev/cuda/global_norm.cu +++ b/dev/cuda/global_norm.cu @@ -59,10 +59,15 @@ __global__ void norm_kernel1(float* out, const T* data, size_t count) { } } - - template __global__ void norm_kernel2(float* out, const T* data, size_t count) { + // concrete example for an A100 GPU (108 SMs, 2048 max threads each) + // so there are 2048 * 108 = 221,184 threads total + // say the block_size is 512, then we would launch 432 blocks in total + // say num_params is ~100M, each thread will process ~500 elements + // warps reduce with warp-level reduce, we have 221,184/32 = 6,912 warps + // and then each warp atomicAdd's to global memory, total of 6,912 atomics + // no shared memory; but one atomic per warp instead of per block namespace cg = cooperative_groups; cg::thread_block block = cg::this_thread_block(); @@ -84,8 +89,6 @@ __global__ void norm_kernel2(float* out, const T* data, size_t count) { } } - - template void global_norm1(float* out, const T* values, size_t count, int block_size) { // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. 
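
For reference, a minimal standalone sketch of the occupancy-based launch sizing used in the kernels above: the grid is sized so that every SM is filled exactly once (maxThreadsPerMultiProcessor * multiProcessorCount / block_size), and each thread strides over the data. The kernel and buffer names here are illustrative placeholders, and the per-thread atomicAdd is a simplification of the block-level reduction in the patched code.

```cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Illustrative kernel: grid-stride loop accumulating squared values.
// Simplified to one atomicAdd per thread; the patched kernel reduces within
// each block first and issues a single atomicAdd per block.
__global__ void norm_squared_kernel(float* out, const float* data, size_t count) {
    size_t index = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    size_t grid_width = (size_t)blockDim.x * gridDim.x;
    float accumulator = 0.f;
    for (size_t i = index; i < count; i += grid_width) {
        accumulator += data[i] * data[i];
    }
    atomicAdd(out, accumulator);
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int block_size = 512;
    // fill the GPU exactly, e.g. A100: 2048 threads/SM * 108 SMs / 512 = 432 blocks
    const int grid_size = prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;

    size_t count = 1 << 20;  // dummy data, zero-initialized
    float *d_data, *d_out;
    cudaMalloc(&d_data, count * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_data, 0, count * sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));
    norm_squared_kernel<<<grid_size, block_size>>>(d_out, d_data, count);

    float norm_squared;
    cudaMemcpy(&norm_squared, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("grid_size = %d, norm^2 = %f\n", grid_size, norm_squared);
    cudaFree(d_data);
    cudaFree(d_out);
    return 0;
}
```
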
diff --git a/train_gpt2.cu b/train_gpt2.cu index 05a0ba2fc..49b92647a 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,31 +1151,26 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, - float* grad_norm, float max_grad_norm, + float* grad_norm, float grad_clip, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard - float scale = 1.f; if(!isfinite(*grad_norm)) { // if we had a numerical problem (e.g, overflow) - // in our gradient calculation, don't mess up the - // existing weights. + // in our gradient norm calculation, don't mess up the existing weights. // TODO increase a global counter somewhere so we actually know if/how often this happens - if(threadIdx.x == 0 && blockIdx.x == 0) { + if(threadIdx.x == 0 && blockIdx.x == 0) { printf("[WARNING] weight update skipped due to non-finite gradients!\n"); } return; } - if(*grad_norm > max_grad_norm) { - scale = max_grad_norm / sqrtf(*grad_norm); - // TODO just for debugging, remove this - if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[norm %f]\n", sqrtf(*grad_norm)); - } - } // get the gradient, m, and v for this parameter - float grad = scale * (float)grads_memory[idx]; + float grad = (float)grads_memory[idx]; + // clip the gradients if their norm surpasses grad_clip + if(*grad_norm > grad_clip) { + grad *= grad_clip / sqrtf(*grad_norm); + } float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -2388,7 +2383,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clipping, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2425,7 +2420,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, grad_clipping, + grad_norm, grad_clip, seed); cudaCheck(cudaGetLastError()); } @@ -2672,7 +2667,7 @@ int main(int argc, char *argv[]) { int use_master_weights = 1; int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training - float grad_clipping = 1.f; + float grad_clip = 1.0f; for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2693,7 +2688,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } - else if (argv[i][1] 
== 'c') { grad_clipping = atof(argv[i+1]); } + else if (argv[i][1] == 'c') { grad_clip = atof(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } @@ -2709,7 +2704,7 @@ int main(int argc, char *argv[]) { printf0("| sequence length T | %-50d |\n", T); printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); - printf0("| grad_clipping | %-50e |\n", grad_clipping); + printf0("| grad_clip | %-50e |\n", grad_clip); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); printf0("| val_max_batches | %-50d |\n", val_max_batches); @@ -2894,7 +2889,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clipping, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); diff --git a/train_gpt2.py b/train_gpt2.py index b57fe432c..ab1c3e44d 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -407,7 +407,7 @@ def print0(*args, **kwargs): parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") - parser.add_argument("--grad_clipping", type=float, default=1, help="maximum gradient magnitude") + parser.add_argument("--grad_clip", type=float, default=1.0, help="maximum gradient magnitude") args = parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -553,7 +553,7 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] - norm = -1 # dummy value to print in inference-only mode + norm = -1.0 # dummy value to print in inference-only mode for step in range(args.num_iterations): t0 = time.time() @@ -577,7 +577,7 @@ def get_batch(): # backward pass if not args.inference_only: loss.backward() - norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clipping) + norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() optimizer.zero_grad(set_to_none=True) From bc58cd1dc1aa6a3aa121033616ffc7f6f01871b5 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 17:51:29 +0000 Subject: [PATCH 108/172] fix small bugs in grad clip, introduce a GPU CPU synch point to communicate the float grad_clip to print it, and small printing changes --- train_gpt2.cu | 65 ++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 49b92647a..f9cd4e2f7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,26 +1151,12 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, - float* grad_norm, float grad_clip, - unsigned 
int seed) { + float grad_scale, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard - if(!isfinite(*grad_norm)) { - // if we had a numerical problem (e.g, overflow) - // in our gradient norm calculation, don't mess up the existing weights. - // TODO increase a global counter somewhere so we actually know if/how often this happens - if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[WARNING] weight update skipped due to non-finite gradients!\n"); - } - return; - } // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[idx]; - // clip the gradients if their norm surpasses grad_clip - if(*grad_norm > grad_clip) { - grad *= grad_clip / sqrtf(*grad_norm); - } + float grad = grad_scale * (float)grads_memory[idx]; float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -1196,7 +1182,7 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg } template -__global__ void global_norm_kernel(float* out, const T* data, size_t count) { +__global__ void global_norm_squared_kernel(float* out, const T* data, size_t count) { // we want as few atomics as possible, so each block tries to do // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block @@ -1693,7 +1679,7 @@ void fused_classifier(Type* logits, Type* losses, } template -void global_norm(float* out, const T* values, size_t count) { +void global_norm_squared(float* out, const T* values, size_t count) { const int block_size = 512; // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. // having one block less than possible is a tiny performance hit, having @@ -1702,7 +1688,9 @@ void global_norm(float* out, const T* values, size_t count) { // on all gpus, so the division really is going to be exact. const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - global_norm_kernel<<>>(out, values, count); + // initialize out with zero + cudaCheck(cudaMemset(out, 0, sizeof(float))); + global_norm_squared_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } @@ -2383,7 +2371,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { +float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2404,25 +2392,34 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } - // repurposing this buffer. We calculate the gradient norm on the GPU, and need it in the next kernel, - // so we _really_ don't want to transfer it here as an actual float. So we just pass around a pointer - // to this memory that is not otherwise needed during the update phase. 
- float* grad_norm = (float*)model->acts.output; - - // global gradient norm - global_norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + // gradient clipping + // repurposing this buffer (which isn't needed now) to write grad norm into it + float* grad_norm_squared = (float*)model->acts.output; + global_norm_squared(grad_norm_squared, (floatX*)model->grads_memory, model->num_parameters); + // transfer the gradient norm to CPU + float grad_norm_squared_cpu = 0.0f; + cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost)); + if(!isfinite(grad_norm_squared_cpu)) { + // may happen due to some issue (e.g. overflow?) + // TODO: later may want to keep a global counter of instabilities like this + printf0("[WARNING]: grad norm is not finite, skipping AdamW update\n"); + return -1.0f; + } + float grad_norm_cpu = sqrtf(grad_norm_squared_cpu); + float grad_scale = (grad_norm_cpu > grad_clip) ? grad_clip / grad_norm_cpu : 1.0f; + // AdamW update int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, - model->m_memory, model->v_memory, num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, grad_clip, - seed); + model->m_memory, model->v_memory, num_parameters, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_scale, seed); cudaCheck(cudaGetLastError()); + return grad_norm_cpu; } void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) @@ -2889,7 +2886,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); + float grad_norm = gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); @@ -2911,8 +2908,8 @@ int main(int argc, char *argv[]) { bias_corrected_ema_tokens_per_second = ema_tokens_per_second / (1.0f - powf(0.95f, step)); } float accumulated_loss = multi_gpu_config.num_processes == 1 ? 
model.mean_loss : model.accumulated_mean_loss; - printf0("step %4d/%d: train loss %f (acc %f) (%f ms, %0f tok/s)\n", - step + 1, train_num_batches, model.mean_loss, accumulated_loss, + printf0("step %4d/%d: train loss %f norm %.4f (%.2f ms, %.0f tok/s)\n", + step + 1, train_num_batches, accumulated_loss, grad_norm, time_elapsed_ms, bias_corrected_ema_tokens_per_second); logger_log_train(&logger, step, model.mean_loss); From 299ce659ec571bd843a10010115d45742df6d285 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 19 May 2024 22:25:00 +0300 Subject: [PATCH 109/172] initialize multi_gpu_config so profile doesn't crash --- profile_gpt2.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 4b24c8973..1a1ad51b9 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -49,6 +49,7 @@ int main(int argc, char *argv[]) { // override number of layers to 1 because all layers repeat the same kernels, only profile once model.config.num_layers = 1; + set_zero_configs(&multi_gpu_config, 0, model.num_parameters); // do a training step gpt2_forward(&model, x, y, B, T); From ead5d3597381a71eddb517775bc7509383651b36 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 19 May 2024 22:54:47 +0000 Subject: [PATCH 110/172] Added warpsize as a constant for better compile time optimization and standardization --- train_gpt2.cu | 67 ++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..741d1a7c8 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -114,6 +114,9 @@ class NvtxRange { #define MAX_1024_THREADS_BLOCKS 1 #endif +// WarpSize is not a compile time constant, this allows the compiler to optimize +#define WARP_SIZE 32U + // cuBLAS workspace. 
Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK const size_t cublaslt_workspace_size = 32 * 1024 * 1024; void* cublaslt_workspace = NULL; @@ -203,10 +206,10 @@ template __device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { // two reductions of up to 1024 threads: // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) - __shared__ float shared_val[32]; - const int lane_id = threadIdx.x % 32; - const int warp_id = threadIdx.x / 32; - const int num_warps = blockDim.x / 32; + __shared__ float shared_val[WARP_SIZE]; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; float warp_val = warp_reduction(val); if (lane_id == 0) { shared_val[warp_id] = warp_val; } @@ -578,10 +581,9 @@ __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, const floatX* __restrict__ inp, const floatX* __restrict__ weight, const floatX* __restrict__ bias, int N, int C) { - const int warp_size = 32; - int lane_id = threadIdx.x % warp_size; - int warp_id = threadIdx.x / warp_size; - int num_warps = blockDim.x / warp_size; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; int idx = blockIdx.x * num_warps + warp_id; if(idx >= N) { return; } // guard @@ -591,7 +593,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // mean float sum = 0.0f; - for (int i = lane_id; i < C; i += warp_size) { + for (int i = lane_id; i < C; i += WARP_SIZE) { sum += (float)x[i]; } sum = warpReduceSum(sum); @@ -602,7 +604,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // rstd sum = 0.0f; - for (int i = lane_id; i < C; i += warp_size) { + for (int i = lane_id; i < C; i += WARP_SIZE) { float diff = (float)x[i] - m; sum += diff * diff; } @@ -614,7 +616,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // final normalization and scaling by weight/bias floatX* o = out + idx * C; - for (int c = lane_id; c < C; c += warp_size) { + for (int c = lane_id; c < C; c += WARP_SIZE) { // load and store using the .cs "streaming" hint to the compiler, // indicating that this data will not be reused soon, and can be streamed through the caches // this allows the threads to get more cache-hits for the (shared) weight and bias parameters @@ -627,8 +629,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, const floatX* inp1, const floatX* inp2, const floatX* weight, const floatX* bias, int N, int C) { - constexpr const int WarpSize = 32; - assert(blockDim.x == WarpSize); + assert(blockDim.x == WARP_SIZE); // load weights and biases into shared memory // do this before we allow any threads to exit! 
@@ -639,8 +640,8 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, x128* s_bias = reinterpret_cast(params) + (C / x128::size); x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); - int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; - for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { s_weight[i/x128::size] = load128(weight + i); s_bias[i/x128::size] = load128(bias + i); } @@ -657,7 +658,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, const float eps = 1e-5f; float sum = 0.0f; - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 in1 = load128cs(inp1 + c); const x128 in2 = load128cs(inp2 + c); x128 out; @@ -673,7 +674,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, float m = sum / C; float v = 0.f; - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 res = s_res[c / x128::size]; for(int k = 0; k < x128::size; ++k) { v += ((float)res[k] - m) * ((float)res[k] - m); @@ -683,7 +684,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, v = warpReduceSum(v) / C; float s = rsqrtf(v + eps); - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 res = s_res[c / x128::size]; const x128 w = s_weight[c / x128::size]; const x128 b = s_bias[c / x128::size]; @@ -898,7 +899,7 @@ template __global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = WARP_SIZE / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -929,7 +930,7 @@ __global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout } } - __shared__ float sub_results[x128::size][32][bdy]; + __shared__ float sub_results[x128::size][WARP_SIZE][bdy]; // reduce within-warp results for (int k = 0; k < x128::size; k++) { @@ -988,12 +989,12 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with const floatX* mean, const floatX* rstd, int B, int T, int C) { extern __shared__ float shared[]; // size = 2 * C + 1 - int warpId = threadIdx.x / warpSize; // warp index within a block - int warpsInBlock = blockDim.x / warpSize; //number of warps in block + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; - int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; - int C_per_iteration = warpSize * x128::size; + int C_per_iteration = WARP_SIZE * x128::size; int iterations_C = C / C_per_iteration; // the first half of shared memory is bias, second is weight @@ -1021,7 +1022,7 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with // first: two reduce operations float dnorm_mean = 0.0f; float 
dnorm_norm_mean = 0.0f; - for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { x128 dout128_i = load128(dout_bt + i); x128 inp128_i = load128(inp_bt + i); x128 weight128_i = load128(weight + i); @@ -1053,9 +1054,9 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; float dnorm_i = (float)weight128[x] * dout_i; // gradient contribution to bias (using shared memory friendly index) - atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + atomicAdd(&dbias_shared[shared_index + x*WARP_SIZE], dout_i); // gradient contribution to weight (using shared memory friendly index) - atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + atomicAdd(&dweight_shared[shared_index + x*WARP_SIZE], norm_bti * dout_i); // gradient contribution to input float dval = 0.0f; dval += dnorm_i; // term 1 @@ -1095,8 +1096,8 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with x128 dbias128 = load128(dbias + global_index); x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - float s_db = scratch_dbias[shared_index + x*warpSize]; - float s_dw = scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*WARP_SIZE]; + float s_dw = scratch_dweight[shared_index + x*WARP_SIZE]; dbias128[x] = (floatX)(s_db + (float)dbias128[x]); dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } @@ -1351,7 +1352,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, NVTX_RANGE_FN(); const int block_size = 512; const int N = B * T; - const int grid_size = CEIL_DIV(N * 32, block_size); + const int grid_size = CEIL_DIV(N * WARP_SIZE, block_size); layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); cudaCheck(cudaGetLastError()); } @@ -1496,7 +1497,7 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo const floatX* weight, const floatX* bias, int N, int C) { const int block_size = 256; - int block_y = block_size / 32; + int block_y = block_size / WARP_SIZE; const int grid_size = CEIL_DIV(N, block_y); size_t smem = (2 + block_y) * C * sizeof(floatX); @@ -1506,7 +1507,7 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); cudaGetLastError(); if(status == cudaSuccess) { - fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); } else { residual_forward(residual, inp1, inp2, N*C); @@ -1546,7 +1547,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; - dim3 block_dim = {4, 8, (unsigned)block_size/32}; + dim3 block_dim = {4, 8, (unsigned)block_size/WARP_SIZE}; const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! 
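
The WARP_SIZE macro above exists because the built-in `warpSize` variable is not a compile-time constant, so it cannot size a static shared array and the index arithmetic cannot be folded into shifts and masks. A simplified, standalone sketch of the warp-then-block reduction pattern these index calculations feed into is shown below (assuming blockDim.x is a multiple of WARP_SIZE; the real blockReduce in train_gpt2.cu is a template that also handles out-of-bounds values and an optional final sync):

```cuda
#define WARP_SIZE 32U

// reduce a value across the 32 lanes of a warp; the full sum ends up in lane 0
__device__ float warpReduceSum(float val) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xFFFFFFFFu, val, offset);
    }
    return val;
}

// reduce a value across the whole block; result is valid in lane 0 of each warp
__device__ float blockReduceSum(float val) {
    __shared__ float shared_val[WARP_SIZE];        // one slot per warp (static shared arrays need a constant size)
    const int lane_id   = threadIdx.x % WARP_SIZE; // constant modulo -> bitmask
    const int warp_id   = threadIdx.x / WARP_SIZE; // constant divide -> shift
    const int num_warps = blockDim.x / WARP_SIZE;

    float warp_val = warpReduceSum(val);           // 1) reduce inside each warp
    if (lane_id == 0) { shared_val[warp_id] = warp_val; }
    __syncthreads();                               // 2) cross-warp via shared memory
    warp_val = (lane_id < num_warps) ? shared_val[lane_id] : 0.0f;
    return warpReduceSum(warp_val);                // 3) reduce the per-warp partials
}
```
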
From 6de1137e0ec531c9660afc4048de09588dc18c5e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 19 May 2024 23:11:57 +0000 Subject: [PATCH 111/172] Moved bounds checks outside of kernel into assertions --- train_gpt2.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..cdffff4eb 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -844,9 +844,8 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons } } -__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2, int N) { +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_out; x128 packed_inp1 = load128cs(inp1 + idx); @@ -858,9 +857,8 @@ __global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const f } #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) -__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { +__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_out; x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache @@ -874,9 +872,8 @@ __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { store128(out + idx, packed_out); } -__global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { +__global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_dinp; x128 packed_inp = load128cs(inp + idx); @@ -1486,8 +1483,9 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N) { NVTX_RANGE_FN(); const int block_size = 256; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - residual_forward_kernel<<>>(out, inp1, inp2, N); + residual_forward_kernel<<>>(out, inp1, inp2); cudaCheck(cudaGetLastError()); } @@ -1519,16 +1517,18 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_forward_kernel2<<>>(out, inp, N); + gelu_forward_kernel2<<>>(out, inp); cudaCheck(cudaGetLastError()); } void gelu_backward(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { NVTX_RANGE_FN(); const int block_size = 128; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_backward_kernel<<>>(dinp, inp, dout, N); + gelu_backward_kernel<<>>(dinp, inp, dout); cudaCheck(cudaGetLastError()); } From 6348d4196d6857244d7833988c405e44afe578d7 Mon Sep 17 00:00:00 2001 From: lancer Date: Sun, 19 May 2024 17:39:25 -0700 Subject: [PATCH 112/172] fix the unsupported block_size --- dev/cuda/matmul_backward_bias.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 12b167083..52d793ac7 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -421,6 +421,9 @@ __global__ void reduce_add_sum_kernel(floatX* dst, 
const float* src, size_t n, s // version1: simple cuBLAS calls void matmul_backward_bias1(floatX* dbias, const floatX* dout, int B, int T, int OC, int block_size) { + if (block_size == 768) { + block_size = 1024; // block_size needs to be power of 2 due to the reduction + } dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); From 2b0667aee15151622797d6bc209eec8f4742f3a7 Mon Sep 17 00:00:00 2001 From: lancer Date: Mon, 20 May 2024 08:00:39 -0700 Subject: [PATCH 113/172] update the utils function and assert --- dev/cuda/matmul_backward_bias.cu | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 52d793ac7..16172bcf2 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -27,6 +27,26 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #define ENABLE_BF16 #include "common.h" + +// ---------------------------------------------------------------------------- +// utility functions +__host__ __device__ bool isPowerOfTwo(int n) { + return (n > 0) && ((n & (n - 1)) == 0); +} + +__host__ __device__ int largestPowerOfTwoLessOrEqual(int n) { + // Return the largest power of 2 less than or equal to n + if (n < 1) { + return 0; + } + + while ((n & (n - 1)) > 0) { + n = n & (n - 1); + } + + return n; +} + // ---------------------------------------------------------------------------- // CPU code reference @@ -421,9 +441,8 @@ __global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, s // version1: simple cuBLAS calls void matmul_backward_bias1(floatX* dbias, const floatX* dout, int B, int T, int OC, int block_size) { - if (block_size == 768) { - block_size = 1024; // block_size needs to be power of 2 due to the reduction - } + block_size = largestPowerOfTwoLessOrEqual(block_size); + assert(isPowerOfTwo(block_size)); // block_size needs to be power of 2 due to the reduction dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); From 722e5b2fe5a4a9cdcaf7041178737b2e14a91591 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 22:43:40 +0000 Subject: [PATCH 114/172] refactor how we treat datasets, because we're about to have more of them and we don't want them to clutter up root dir etc. this is only step 1, i'm about to refactor a bunch of the dataloading, how the .bin files work and are loaded, how the DataLoader works, etc. 
This is all needed to support good evals and training at scale --- .github/workflows/ci.yml | 30 ++++++++--------- README.md | 26 +++++++-------- dev/{ => data}/hellaswag.py | 21 ++---------- dev/{ => data}/mmlu.py | 21 ++---------- .../data/tinyshakespeare.py | 33 +++++-------------- .../data/tinystories.py | 33 ++++++------------- train_gpt2.c | 8 ++--- train_gpt2.cu | 4 +-- train_gpt2.py | 2 +- train_gpt2_fp32.cu | 4 +-- 10 files changed, 61 insertions(+), 121 deletions(-) rename dev/{ => data}/hellaswag.py (92%) rename dev/{ => data}/mmlu.py (90%) rename prepro_tinyshakespeare.py => dev/data/tinyshakespeare.py (70%) rename prepro_tinystories.py => dev/data/tinystories.py (82%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb19f2ba5..e4c211bc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v4 - name: Install OpenMP - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -33,7 +33,7 @@ jobs: run: pip install -r requirements.txt - name: Run preprocessing - run: python prepro_tinyshakespeare.py + run: python dev/data/tinyshakespeare.py - name: Train model run: python train_gpt2.py --device=cpu @@ -45,9 +45,9 @@ jobs: $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' $output = './make-bin-win64.zip' $wc.DownloadFile($url, $output) - + - name: Unzip Win32 Makefile - if: matrix.os == 'windows-latest' + if: matrix.os == 'windows-latest' run: | unzip make-bin-win64.zip @@ -59,26 +59,26 @@ jobs: if: matrix.os == 'windows-latest' shell: cmd run: | - call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 - name: Execute testing program (With OpenMP) - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 - - name: Execute Windows testing program (With OpenMP) - if: matrix.os == 'windows-latest' + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' shell: cmd run: | copy test_gpt2 test_gpt2.exe - test_gpt2.exe + test_gpt2.exe - name: Compile training and testing program without OpenMP - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: ./test_gpt2 build-cuda-windows: @@ -93,11 +93,11 @@ jobs: $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' $output = './make-bin-win64.zip' $wc.DownloadFile($url, $output) - + - name: Unzip Win32 Makefile run: | unzip make-bin-win64.zip - + - name: Install Cuda Toolkit 12.4 on Windows run: | mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" @@ -134,9 +134,9 @@ jobs: shell: cmd working-directory: ${{ github.workspace }} run: | - call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu - + build-cuda-fp32: runs-on: 
ubuntu-latest container: diff --git a/README.md b/README.md index 469326fbf..aee282fe0 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The "I don't care about anything I just want to train and I have a GPU" section. ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2fp32cu ./train_gpt2fp32cu @@ -22,17 +22,17 @@ The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent ## quick start (GPU, fast bleeding edge) -I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu` and also turn on flash attention. Run: +I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu`. Run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see the CUDA section below), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can go even faster with flash attention. Adjust the make command as follows to compile with cudnn / flash attention: ```bash make train_gpt2cu USE_CUDNN=1 @@ -48,9 +48,9 @@ Note that the default batch size is very low (4). If you have enough memory on y My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: ```bash -python prepro_tinystories.py +python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i data/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 ``` Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. @@ -61,7 +61,7 @@ The "I am so GPU poor that I don't even have one" section. No worries, run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2 OMP_NUM_THREADS=8 ./train_gpt2 @@ -73,10 +73,10 @@ The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent You'll be using the (more bleeding edge) mixed precision version of the code: -``` +```bash sudo apt install openmpi-bin openmpi-doc libopenmpi-dev pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu mpirun -np ./train_gpt2cu @@ -89,17 +89,17 @@ Sub in the number of GPUs you'd like to run on in the last command. Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: ```bash -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py ``` This prints: ``` -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +Saved 32768 tokens to (...)/tiny_shakespeare_val.bin +Saved 305260 tokens to (...)/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `prepro_tinystories.py`. 
+The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `tinystories.py`. In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: diff --git a/dev/hellaswag.py b/dev/data/hellaswag.py similarity index 92% rename from dev/hellaswag.py rename to dev/data/hellaswag.py index 361a7757a..a1c14f591 100644 --- a/dev/hellaswag.py +++ b/dev/data/hellaswag.py @@ -29,14 +29,14 @@ import requests import tiktoken from tqdm import tqdm - import torch import torch.nn as nn from torch.nn import functional as F - from transformers import GPT2LMHeadModel +from data_common import download_file -DATA_CACHE_DIR = os.path.join("data", "hellaswag") +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "hellaswag") hellaswags = { "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", @@ -46,21 +46,6 @@ enc = tiktoken.get_encoding("gpt2") -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(split): """Downloads HellaSwag DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) diff --git a/dev/mmlu.py b/dev/data/mmlu.py similarity index 90% rename from dev/mmlu.py rename to dev/data/mmlu.py index b61fe9324..bda8855b8 100644 --- a/dev/mmlu.py +++ b/dev/data/mmlu.py @@ -15,33 +15,18 @@ import tiktoken import pandas as pd from tqdm import tqdm - import torch import torch.nn as nn from torch.nn import functional as F - from transformers import GPT2LMHeadModel +from data_common import download_file -DATA_CACHE_DIR = os.path.join("data", "mmlu") +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "mmlu") enc = tiktoken.get_encoding("gpt2") data_url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads MMLU to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) diff --git a/prepro_tinyshakespeare.py b/dev/data/tinyshakespeare.py similarity index 70% rename from prepro_tinyshakespeare.py rename to dev/data/tinyshakespeare.py index a5d562284..6d795aef7 100644 --- a/prepro_tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -3,11 +3,11 @@ - The 
download is from Github. - The tokenization is GPT-2 tokenizer with tiktoken -The output is written to a newly created data/ folder. +The output is written to a newly created tinyshakespeare/ folder. The script prints: -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +Saved 32768 tokens to tinyshakespeare/tiny_shakespeare_val.bin +Saved 305260 tokens to tinyshakespeare/tiny_shakespeare_train.bin And runs in a few seconds depending on your internet connection and computer. The .bin files are raw byte @@ -15,36 +15,20 @@ """ import os -import requests -from tqdm import tqdm - import tiktoken import numpy as np +from data_common import download_file + +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinyshakespeare") -DATA_CACHE_DIR = "data" enc = tiktoken.get_encoding("gpt2") encode = lambda s: enc.encode(s, allowed_special={'<|endoftext|>'}) -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads the TinyShakespeare dataset to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) - - # download the TinyStories dataset, unless it's already downloaded + # download the TinyShakespeare dataset, unless it's already downloaded data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") if not os.path.exists(data_filename): @@ -54,7 +38,6 @@ def download(): print(f"{data_filename} already exists, skipping download...") def tokenize(): - eot = enc._special_tokens['<|endoftext|>'] # end of text token data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") text = open(data_filename, 'r').read() # let's treat every person's statement in the dialog as a separate document diff --git a/prepro_tinystories.py b/dev/data/tinystories.py similarity index 82% rename from prepro_tinystories.py rename to dev/data/tinystories.py index 8f2c1e8ad..628e5a7bb 100644 --- a/prepro_tinystories.py +++ b/dev/data/tinystories.py @@ -3,13 +3,13 @@ - The download is from HuggingFace datasets. - The tokenization is GPT-2 tokenizer with tiktoken -The output is written to a newly created data/ folder. +The output is written to a newly created tinystories/ folder. The script prints: Tokenizing val split... -Saved 19043638 tokens to data/TinyStories_val.bin +Saved 19043638 tokens to tinystories/TinyStories_val.bin Tokenizing train split... -Saved 925653391 tokens to data/TinyStories_train.bin +Saved 925653391 tokens to tinystories/TinyStories_train.bin And runs in 1-2 minutes two depending on your internet connection and computer. 
The .bin files are raw byte @@ -23,29 +23,16 @@ import requests from tqdm import tqdm from concurrent.futures import ProcessPoolExecutor, as_completed - import tiktoken import numpy as np +from data_common import download_file + +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinystories") -DATA_CACHE_DIR = "data" enc = tiktoken.get_encoding("gpt2") encode = lambda s: enc.encode_ordinary(s) -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads the TinyStories dataset to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) @@ -70,11 +57,11 @@ def download(): # print a single example just for debugging and such shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) - with open(shard_filenames[0], "r") as f: - data = json.load(f) print("Download done.") print(f"Number of shards: {len(shard_filenames)}") - #print(f"Example story:\n{data[0]}") + # with open(shard_filenames[0], "r") as f: + # data = json.load(f) + # print(f"Example story:\n{data[0]}") def process_shard(shard_index, shard_filename): with open(shard_filename, "r") as f: diff --git a/train_gpt2.c b/train_gpt2.c index 95c46ab86..8fdf46e4b 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1100,10 +1100,10 @@ int main() { gpt2_build_from_checkpoint(&model, "gpt2_124M.bin"); // build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories - const char* tiny_stories_train = "data/TinyStories_train.bin"; - const char* tiny_stories_val = "data/TinyStories_val.bin"; - const char* tiny_shakespeare_train = "data/tiny_shakespeare_train.bin"; - const char* tiny_shakespeare_val = "data/tiny_shakespeare_val.bin"; + const char* tiny_stories_train = "dev/data/tinystories/TinyStories_train.bin"; + const char* tiny_stories_val = "dev/data/tinystories/TinyStories_val.bin"; + const char* tiny_shakespeare_train = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* tiny_shakespeare_val = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* train_tokens = access(tiny_shakespeare_train, F_OK) != -1 ? tiny_shakespeare_train : tiny_stories_train; const char* val_tokens = access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val; int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..0b69574e2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2620,7 +2620,7 @@ void error_usage() { // default run = debugging run with TinyShakespeare // bigger run = train on TinyStories! e.g. 
val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); + fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); @@ -2648,7 +2648,7 @@ int main(int argc, char *argv[]) { multi_gpu_config = multi_gpu_config_init(&argc, &argv); // read in the (optional) command line arguments - const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. data/TinyStories + const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size diff --git a/train_gpt2.py b/train_gpt2.py index ab1c3e44d..ceda8ae5e 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -394,7 +394,7 @@ def print0(*args, **kwargs): # if you'd like to e.g. time the forward pass only, call this script as: # python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 parser = argparse.ArgumentParser() - parser.add_argument("--input_bin", type=str, default="data/tiny_shakespeare_val.bin", help="input .bin to train on") + parser.add_argument("--input_bin", type=str, default="dev/data/tinyshakespeare/tiny_shakespeare_val.bin", help="input .bin to train on") parser.add_argument("--model", type=str, default="gpt2", help="gpt2|gpt2-medium|gpt2-large|gpt2-xl") parser.add_argument("--write_tensors", type=int, default=1, help="write tensors to disk") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index 178288f33..d2cf53b43 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1595,7 +1595,7 @@ void error_usage() { // default run = debugging run with TinyShakespeare // bigger run = train on TinyStories! e.g. val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2fp32cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2fp32cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); + fprintf(stderr, "Example: ./train_gpt2fp32cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); @@ -1614,7 +1614,7 @@ void error_usage() { int main(int argc, char *argv[]) { // read in the (optional) command line arguments - const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. data/TinyStories + const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max From f671cf92880ec52e5408c27a6b5767e03b708345 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:02:41 +0000 Subject: [PATCH 115/172] more changes, trying to help people out because when this merges to master it will brick everyone's code... 
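The "help" here is a set of fail-fast checks with migration hints around the places where data files are opened. A minimal sketch of the pattern as it looks on the Python side (the helper name is illustrative; the paths are simply the new defaults):

```python
import os
import sys

def require_data_file(path: str) -> None:
    # fail fast with migration hints if a .bin data file is missing
    if not os.path.isfile(path):
        print(f"ERROR: input .bin file not found: {path}")
        print("---> HINT: the data files recently moved from data/ to dev/data/<dataset>/")
        print("---> HINT: re-run the prepro script, e.g. `python dev/data/tinyshakespeare.py`")
        sys.exit(1)

# e.g. the default validation split now lives here:
require_data_file("dev/data/tinyshakespeare/tiny_shakespeare_val.bin")
```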
--- train_gpt2.c | 3 +++ train_gpt2.cu | 2 +- train_gpt2.py | 6 +++++- utils.h | 3 ++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 8fdf46e4b..41ec3147e 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1021,6 +1021,9 @@ void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { loader->tokens_file = fopen(filename, "rb"); if (loader->tokens_file == NULL) { printf("Error opening tokens file\n"); + printf("--> HINT: the data directory may have moved recently from data/ to dev/data/(dataset)/"); + printf("--> HINT: refer again to the README file and possibly re-run the dataset prepro script."); + printf("--> HINT: example: re-run `python dev/data/tinyshakespeare.py`"); exit(1); } diff --git a/train_gpt2.cu b/train_gpt2.cu index 0b69574e2..6584a4cd5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2622,7 +2622,7 @@ void error_usage() { fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -i input dataset prefix (default = dev/data/tinyshakespeare/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); diff --git a/train_gpt2.py b/train_gpt2.py index ceda8ae5e..4d61e68cd 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -495,7 +495,11 @@ def print0(*args, **kwargs): # load the tokens # note we're using val by default instead of train split just because it is smaller/faster - assert os.path.isfile(args.input_bin) + if not os.path.isfile(args.input_bin): + print0(f"ERROR: input .bin file not found: {args.input_bin}") + print0("---> HINT: try to re-run the data prepro script. these recently moved to dev/data") + print0("---> HINT: for example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: tokens = np.frombuffer(f.read(), dtype=np.int32) diff --git a/utils.h b/utils.h index 5d594cb6c..a40de67e3 100644 --- a/utils.h +++ b/utils.h @@ -24,7 +24,8 @@ FILE *fopen_check(const char *path, const char *mode, const char *file, int line fprintf(stderr, " Line: %d\n", line); fprintf(stderr, " Path: %s\n", path); fprintf(stderr, " Mode: %s\n", mode); - fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n"); + fprintf(stderr, "---> HINT 1: dataset files/code have moved to dev/data recently (May 20, 2024). You may have to mv them from the legacy data/ dir to dev/data/(dataset), or re-run the data preprocessing script. 
Refer back to the main README\n"); + fprintf(stderr, "---> HINT 2: possibly try to re-run `python train_gpt2.py`\n"); exit(EXIT_FAILURE); } return fp; From 71774b3c3d83f651e143f04cc2ab79b9b757f2eb Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:10:49 +0000 Subject: [PATCH 116/172] oops forgot to include data_common.py --- dev/data/data_common.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 dev/data/data_common.py diff --git a/dev/data/data_common.py b/dev/data/data_common.py new file mode 100644 index 000000000..d6b71ecbc --- /dev/null +++ b/dev/data/data_common.py @@ -0,0 +1,21 @@ +""" +Common utilities for the datasets +""" + +import requests +from tqdm import tqdm + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) From 7d11b7996c86b79095e6a9967010f761d0c6f363 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:13:42 +0000 Subject: [PATCH 117/172] i also forgot to include the readme file for the new dev/data dir --- dev/data/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 dev/data/README.md diff --git a/dev/data/README.md b/dev/data/README.md new file mode 100644 index 000000000..b13675c1e --- /dev/null +++ b/dev/data/README.md @@ -0,0 +1,8 @@ +# dev/data organization + +The idea is that each dataset has a .py file here in the root of `dev/data`, and each dataset then creates a directory here, and writes and caches anything inside that directory. So for example: + +- running `python tinystories.py` will create a directory `tinystories` with its .bin files inside it +- running `python tinyshakespeare.py` will create a directory `tinyshakespeare` with its .bin files inside it + +And so on. This way we can nicely organize multiple datasets here, share common utilities between them, and then point the .py/.c code in the root of the project accordingly to these. 
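Concretely, a new dataset script follows the same skeleton as tinyshakespeare.py: compute its own cache directory next to the script, download into it, tokenize, and write .bin files into it. A minimal sketch, where the dataset name and URL are placeholders:

```python
# dev/data/mynewdataset.py -- "mynewdataset" is a placeholder name
import os
import tiktoken
from data_common import download_file  # shared helper from dev/data/data_common.py

DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "mynewdataset")

def download():
    # download the raw text into this dataset's own cache directory
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    data_url = "https://example.com/mynewdataset.txt"  # placeholder URL
    data_filename = os.path.join(DATA_CACHE_DIR, "mynewdataset.txt")
    if not os.path.exists(data_filename):
        download_file(data_url, data_filename)

def tokenize():
    # GPT-2 tokenize the raw text; train/val .bin files then get written into DATA_CACHE_DIR
    enc = tiktoken.get_encoding("gpt2")
    with open(os.path.join(DATA_CACHE_DIR, "mynewdataset.txt"), "r") as f:
        tokens = enc.encode_ordinary(f.read())
    # ... split into train/val and write the .bin files here ...

if __name__ == "__main__":
    download()
    tokenize()
```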
From bced34d71e0d5006da71b62861de4c58e82a66ed Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:15:44 +0000 Subject: [PATCH 118/172] adjust gitignore --- .gitignore | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f60885e23..ba0de8b90 100644 --- a/.gitignore +++ b/.gitignore @@ -2,12 +2,18 @@ .vscode .venv -# data files -data - # .bin files generated by Python *.bin +# data directories +dev/data/__pycache__/ +dev/data/fineweb.py +dev/data/fineweb/ +dev/data/hellaswag/ +dev/data/mmlu/ +dev/data/tinyshakespeare/ +dev/data/tinystories/ + # binaries test_gpt2 test_gpt2cu @@ -22,6 +28,7 @@ dev/cuda/classifier_fused dev/cuda/adamw dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce +dev/cuda/global_norm *.obj *.exe *.o From c51cd70cfbfd2ef6f8ef4016b7f1a58bd741b314 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:22:31 +0000 Subject: [PATCH 119/172] fix a slip in gitignore, i think i am getting tired today --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index ba0de8b90..05391b6d1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,6 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb.py dev/data/fineweb/ dev/data/hellaswag/ dev/data/mmlu/ From 4bbd01fe120e8b45343154de1e2b7d886c2b0c1e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 00:16:07 +0000 Subject: [PATCH 120/172] add fineweb, and add the first version of a new write_shard function that contains a header properly --- dev/data/data_common.py | 22 +++++++++++ dev/data/fineweb.py | 83 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 dev/data/fineweb.py diff --git a/dev/data/data_common.py b/dev/data/data_common.py index d6b71ecbc..c3147de76 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -4,6 +4,8 @@ import requests from tqdm import tqdm +import numpy as np + def download_file(url: str, fname: str, chunk_size=1024): """Helper function to download a file from a given url""" @@ -19,3 +21,23 @@ def download_file(url: str, fname: str, chunk_size=1024): for data in resp.iter_content(chunk_size=chunk_size): size = file.write(data) bar.update(size) + + +def write_shard(filename, toks): + """Saves token data as a .bin file, for reading in C""" + assert len(toks) < 2**31, "token count too large" # ~2.1B tokens + # construct the header + header = np.zeros(256, dtype=np.int32) + header[0] = 20240520 # magic + header[1] = 1 # version + header[2] = len(toks) # number of tokens after the 256*4 bytes of header (each 2 bytes as uint16) + # validate that no token exceeds a uint16 + maxtok = 2**16 + assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" + # construct the tokens + toks_np = np.array(toks, dtype=np.uint16) + # write to file + print(f"writing {filename}") + with open(filename, "wb") as f: + f.write(header.tobytes()) + f.write(toks_np.tobytes()) diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py new file mode 100644 index 000000000..8b6ef4bfa --- /dev/null +++ b/dev/data/fineweb.py @@ -0,0 +1,83 @@ +""" +FineWeb dataset (for srs pretraining) +https://huggingface.co/datasets/HuggingFaceFW/fineweb + +example doc to highlight the structure of the dataset: +{ + "text": "Posted by mattsmith on 20th April 2012\nStraight from...", + "id": "", + "dump": "CC-MAIN-2013-20", + "url": "http://nleastchatter.com/philliesphandom/tag/freddy-galvis/", + "date": "2013-05-18T07:24:47Z", 
+ "file_path": "s3://commoncrawl/long.../path.../file.gz", + "language": "en", + "language_score": 0.9185474514961243, + "token_count": 594 +} +""" +import os +import argparse +import multiprocessing as mp +import numpy as np +import tiktoken +# from huggingface_hub import snapshot_download +from datasets import load_dataset +from tqdm import tqdm +import argparse + +from data_common import write_shard +# ------------------------------------------ + +parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing") +parser.add_argument("-s", "--shard_size", type=int, default=10**9, help="Size of each shard in tokens") +args = parser.parse_args() + +# create the cache directory if it doesn't exist yet +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "fineweb10B") +os.makedirs(DATA_CACHE_DIR, exist_ok=True) + +# todo is this needed? or just the load_dataset below? +# download 10B Tokens sample (~28GB on disk) +# folder = snapshot_download( +# "HuggingFaceFW/fineweb", +# repo_type="dataset", +# local_dir="./data/fineweb/", +# allow_patterns="sample/10BT/*" +# ) +fw = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train") + +# init the tokenizer +enc = tiktoken.get_encoding("gpt2") +eot = enc._special_tokens['<|endoftext|>'] # end of text token + +# helper functions +def tokenize(doc): + return enc.encode_ordinary(doc["text"]) + +# main loop write files +pool = mp.Pool() +shard_index = 0 +all_tokens = [] +progress_bar = None +for tokens in pool.imap(tokenize, fw): + + # record the tokens and make sure to separate documents + all_tokens.append(eot) + all_tokens.extend(tokens) + + # update progress bar + if progress_bar is None: + progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") + progress_bar.update(len(tokens)) + + # if we reach shard_size tokens, write shard to disk + if len(all_tokens) >= args.shard_size: + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") + write_tokens = all_tokens[:args.shard_size] + rest_tokens = all_tokens[args.shard_size:] + write_shard(filename, write_tokens) + shard_index += 1 + progress_bar = None + # note: create a copy so Python can free the all_tokens memory above + # the list rest_tokens is expected to be very small + all_tokens = [t for t in rest_tokens] From b5e75dde8e8d20b13177b060f3ed364bbe50eb12 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 15:57:07 +0100 Subject: [PATCH 121/172] Fully deterministic encoder backward kernels for train_gpt2.cu --- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 245 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 189 insertions(+), 60 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index f2ac0e84c..f79e9ada4 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { // do a training step gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model); + gpt2_backward(&model, x); gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings diff --git a/test_gpt2.cu b/test_gpt2.cu index 50a291f18..d06734507 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -203,7 +203,7 @@ int main(int argc, char *argv[]) { clock_gettime(CLOCK_MONOTONIC, &start); gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model); + gpt2_backward(&model, x); clock_gettime(CLOCK_MONOTONIC, &end); double 
time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..899293f75 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -38,6 +38,9 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include #include #include +#include +#include +#include // GPU / CUDA related #include #include @@ -532,50 +535,108 @@ __global__ void encoder_forward_kernel3(floatX* out, store128(out_btc, packed_out); } -template -__device__ void atomicStochasticAdd(T* address, float val0, float val1, unsigned int seed) { - static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); - float2 val = make_float2(val0, val1); - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; - unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - do { - assumed = old; - float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, - (float)(reinterpret_cast(&old)[1]) + val.y); - T new_rounded[2]; - stochastic_rounding(new_fp32.x, &new_rounded[0], random); - stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); - old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_rounded); - } while (assumed != old); -} -__device__ void atomicStochasticAdd(float* address, float val0, float val1, unsigned int seed) { - atomicAdd(address, val0); - atomicAdd(address + 1, val1); -} - -__global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, - const floatX* dout, const int* inp, - int B, int T, int C, unsigned int seed) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int N = B * T * C; - idx *= 2; // 2 elements per thread - if (idx >= N) { return; } +template +__global__ void wte_backward_kernel(floatX* dwte, + const int4* bucket_info, const int* workload_indices, const floatX* dout, const int* inp, + unsigned int seed, int B, int T, int C) { + // In order to be deterministic, we preprocess the inputs on the cpu into "buckets" + // Each bucket corresponds to (WARP_SIZE * x128::size) channels for a single vocabulary token + // Each thread handles x128::size channels, e.g. 256 per warp for BF16 + // Each block handles (BLOCK_SIZE / WARP_SIZE) elements in a single bucket in parallel + // If a bucket has less than 8 elements, some warps will return immediately + // If a bucket has more than 8 elements, we will loop over all of them + // The buckets are sorted on the CPU so the largest buckets start 1st + int bucket = blockIdx.x; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int c_per_warp = WARP_SIZE * x128::size; + + int bucket_start_idx = bucket_info[bucket].x; + int bucket_size = bucket_info[bucket].y; + int bucket_ix = bucket_info[bucket].z; + int c = bucket_info[bucket].w * c_per_warp + (lane_id * x128::size); + + // Each thread handles "x128::size" channels, so at fp8, each warp would handle 512 channels + // If C is not a multiple of this (e.g. 
768), some buckets/c_groups cannot use the entire warp + if (c >= C) { return; } + // Exit early if this is a small bucket and this warp doesn't have any items to process + if (warp_id >= bucket_size) { return; } + + float accum[x128::size] = {0.0f}; + __shared__ float accum_shared[x128::size * BLOCK_SIZE]; + + for(int item = warp_id; item < bucket_size; item += BLOCK_SIZE/WARP_SIZE) { + int bt = workload_indices[bucket_start_idx + item]; + int b = bt / T; + int t = bt % T; + + const floatX* dout_btc = dout + b * T * C + t * C + c; + x128 packed_inp1 = load128cs(dout_btc); + for (int k = 0; k < packed_inp1.size; k++) { + accum[k] += (float)packed_inp1[k]; + } + } - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; + if (warp_id != 0) { + // we accumulate into warp 0, so only the other warps need to write to shared memory + for (int k = 0; k < x128::size; k++) { + accum_shared[threadIdx.x + k * BLOCK_SIZE] = accum[k]; + } + return; // only warp 0 is needed after writing to shared memory + } - int ix = inp[b * T + t]; + // Read dwte for warp 0 even if other warps are not finished yet to maximise latency tolerance + floatX* dwte_ix = dwte + bucket_ix * C + c; + x128 packed_in_out = load128(dwte_ix); - const floatX* dout_btc = dout + b * T * C + t * C + c; - floatX* dwte_ix = dwte + ix * C + c; - floatX* dwpe_tc = dwpe + t * C + c; + // note: threads which have returned are considered synchronised by CUDA so no risk of deadlock + __syncthreads(); - float2 dout_data = make_float2(dout_btc[0], dout_btc[1]); - atomicStochasticAdd(dwte_ix, dout_data.x, dout_data.y, seed); - atomicStochasticAdd(dwpe_tc, dout_data.x, dout_data.y, seed ^ 0xFFFFFFFF); + // Accumulate into warp 0's registers by reading the values of the other warps in shared memory + for (int i = threadIdx.x+WARP_SIZE; i < min(BLOCK_SIZE, bucket_size*WARP_SIZE); i += WARP_SIZE) { + for (int k = 0; k < x128::size; k++) { + accum[k] += accum_shared[i + k * BLOCK_SIZE]; + } + } + + // Add the result to dwte and write back to global memory (read-modify-write) + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic + stochastic_rounding(accum[k] + (float)packed_in_out[k], &packed_in_out[k], seed + k); + } + store128(dwte_ix, packed_in_out); +} + +__global__ void wpe_backward_kernel(floatX* dwpe, + const floatX* dout, const int* inp, + int B, int T, int C, unsigned int seed) { + // Each thread handles x128::size "channel positions", e.g. 256 per warp for BF16 + // For gpt2-124M BF16, C=768 and T=1024, so 3 warps per channel and 3072 warps in total + // For each "channel position" we sum the gradients for every batch at that C/T element + // This way each dwte element is only updated once, and the kernel is fully deterministic! 
+ // The previous kernel was not deterministic, as batches were aggregated with atomicAdd + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= T * C) { return; } + + // if C is not a multiple of WARP_SIZE*x128::size, it's OK for some warps to handle multiple t + int t = idx / C; + int c = idx % C; + float accum[x128::size] = {0.0f}; + + for (int b = 0; b < B; b++) { + x128 packed_dout = load128cs(dout + (b * T * C) + (t * C) + c); // will never be read again + for (int k = 0; k < x128::size; k++) { + accum[k] += (float)packed_dout[k]; + } + } + + floatX* dwpe_tc = dwpe + (t * C) + c; + x128 packed_dwpe = load128(dwpe_tc); + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic + stochastic_rounding(accum[k] + (float)packed_dwpe[k], &packed_dwpe[k], seed + k); + } + store128(dwpe_tc, packed_dwpe); } __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, @@ -783,10 +844,9 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons // directly autoregressive, so we only compute the lower triangular part // uses the online softmax algorithm assert(T % 4 == 0); - const int warp_size = 32; - int lane_id = threadIdx.x % warp_size; - int warp_id = threadIdx.x / warp_size; - int num_warps = blockDim.x / warp_size; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; // micro-optimization: we iterate backwards so that // after the softmax backward operation completes, the cache retains the @@ -809,7 +869,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); - for (int i = lane_id; i < pos_by_4; i += warp_size) { + for (int i = lane_id; i < pos_by_4; i += WARP_SIZE) { float regarray[4]; for (int k = 0; k < 4; ++k) { regarray[k] = (float)x_aligned[4*i + k]; @@ -838,7 +898,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons float norm = 1.f / sum; // divide the whole row by the sum - for (int i = lane_id; i <= own_pos; i += warp_size) { + for (int i = lane_id; i <= own_pos; i += WARP_SIZE) { // recalculation is faster than doing the round-trip through memory. 
float ev = expf(inv_temperature * ((float)__ldcs(x + i) - global_maxval)); __stcs(out + idx * T + i, (floatX)(ev * norm)); @@ -1354,14 +1414,70 @@ void encoder_forward(floatX* out, cudaCheck(cudaGetLastError()); } -void encoder_backward(floatX* dwte, floatX* dwpe, - const floatX* dout, const int* inp, - int B, int T, int C, unsigned int seed) { +// Fully deterministic (see comments in wte_backward_kernel and wpe_backward_kernel for more details) +void encoder_backward(floatX* dwte, floatX* dwpe, floatX* scratch, // gpu outputs & scratch + int* workload_indices, int4* bucket_info, // cpu scratch buffers + const floatX* dout, const int* inp, const int* inputs_cpu, // cpu/gpu inputs + int B, int T, int C, unsigned int seed) { NVTX_RANGE_FN(); - const int N = B * T * C; + + // Launch wpe kernel first (so it runs on the GPU in parallel with the CPU pre-processing for wte) const int block_size = 256; - const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); + const int N = T * C / x128::size; + const int grid_size = CEIL_DIV(N, block_size); + wpe_backward_kernel<<>>(dwpe, dout, inp, B, T, C, seed); + + // check the GPU scratch buffer is large enough to hold the bucket info and workload indices + // todo - this is trivially true given hardcoded scratch buffer size here, is this useful? + int num_c_groups = CEIL_DIV(C, x128::size * WARP_SIZE); + assert(B*T*num_c_groups * (sizeof(int4)+sizeof(int)) <= B*T*3*C * sizeof(floatX)); + + // Step 1: Sort inputs into buckets + int total_items = 0; + std::unordered_map> buckets; + for (uint64_t bt = 0; bt < B * T; bt++) { + for (uint64_t c_group = 0; c_group < num_c_groups; c_group++) { + // todo - passing c_group/inputs_cpu[bt] in data to avoid a second hash lookup is a bit hacky + uint64_t data = bt + (c_group<<32ULL) + ((uint64_t)inputs_cpu[bt]<<42ULL); + buckets[c_group + num_c_groups * inputs_cpu[bt]].push_back(data); + total_items++; + } + } + + // Step 2: Sort buckets by size in descending order + // this is so the largest buckets are processed first by the GPU + // otherwise, if they started late, they would still be running with the rest of the GPU idle + std::vector>> sortedBuckets(buckets.begin(), buckets.end()); + std::sort(sortedBuckets.begin(), sortedBuckets.end(), // ugly because we don't have a typedef for the std::pair + [](const std::pair>& a, const std::pair>& b) { + return a.second.size() > b.second.size(); + }); + + int num_buckets = buckets.size(); + int bucket_index = 0; + int workload_index = 0; + for (const auto& bucket : sortedBuckets) { + bucket_info[bucket_index].x = workload_index; // bucket start + bucket_info[bucket_index].y = bucket.second.size(); // bucket size + bucket_info[bucket_index].z = (bucket.second[0] >> 42ULL) & ((1ULL<<20ULL)-1); // bucket ix + bucket_info[bucket_index].w = (bucket.second[0] >> 32ULL) & ((1ULL<<10ULL)-1); // bucket c + + for (uint64_t idx : bucket.second) { + workload_indices[workload_index++] = (int)(idx & ((1ULL<<31ULL)-1ULL)); + } + bucket_index++; + } + + // Step 3: Copy data from host to device (async until the last one to avoid synchronising CPU/GPU twice) + // todo - could use CUDA events (even without streams) to avoid CPU/GPU synchronisation completely + int4* d_bucket_info = (int4*)scratch; + int* d_workload_indices = (int*)(scratch + B*T*num_c_groups * sizeof(int4)); + cudaMemcpyAsync(d_bucket_info, bucket_info, num_buckets * sizeof(int4), cudaMemcpyHostToDevice); + 
cudaMemcpy(d_workload_indices, workload_indices, total_items * sizeof(int), cudaMemcpyHostToDevice); + + // Launch wte kernel + // todo - profile block sizes on more content (depends on number of buckets and on GPU?) + wte_backward_kernel<256><<>>(dwte, d_bucket_info, d_workload_indices, dout, inp, seed, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1947,6 +2063,9 @@ typedef struct { unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; int recompute; + // todo - if other functions need cpu scratch buffers in the future, reuse as generic scratch? + int* workload_indices; // encoder_backward, B*T*num_c_groups (int) + int4* bucket_info; // encoder_backward, B*T*num_c_groups (int4) - size for worst case } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -2022,6 +2141,8 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->inputs = NULL; model->targets = NULL; model->cpu_losses = NULL; + model->workload_indices = NULL; + model->bucket_info = NULL; model->batch_size = 0; model->seq_len = 0; model->mean_loss = -1.0f; // -1.0f will designate no loss @@ -2195,7 +2316,7 @@ void gpt2_zero_grad(GPT2 *model) { } } -void gpt2_backward(GPT2 *model) { +void gpt2_backward(GPT2 *model, int* inputs) { NVTX_RANGE_FN(); // double check we forwarded previously, with targets if (model->mean_loss == -1.0f) { @@ -2221,6 +2342,11 @@ void gpt2_backward(GPT2 *model) { model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); // init gradients of parameters and activations to zero gpt2_zero_grad(model); + // initialise cpu scratch buffers for encoder backward + size_t num_c_groups = model->config.channels / (WARP_SIZE * x128::size); + assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?) 
+ model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups); + model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups); } // convenience shortcuts, size_t instead of int so that pointer arithmetics don't overflow @@ -2241,7 +2367,8 @@ void gpt2_backward(GPT2 *model) { cudaCheck(cudaMemset(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX))); // re-use the output buffer of the forward pass as a scratchpad during backward pass - float* scratchF = (float*)acts.output; + float* scratchF = (float*)acts.output; + floatX* scratchX = (floatX*)acts.output; // we kick off the chain rule by filling in dlosses with 1.0f/(B*T) // this was done in the fused classifier kernel as last step of forward pass @@ -2323,7 +2450,6 @@ void gpt2_backward(GPT2 *model) { floatX* buffer_a = l_atty; floatX* buffer_b = l_fch; // this is B x T x 4C, so even larger than what we need floatX* dl_preatt = (floatX*)grads_acts.preatt; // dedicated scratchpad allocation - floatX* scratchX = (floatX*)acts.output; attention_backward(dl_bt4c, buffer_b, dl_preatt, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH); #endif @@ -2332,7 +2458,8 @@ void gpt2_backward(GPT2 *model) { // layernorm backward does += to dresidual, so it correctly accumulates gradient for the Attention block above layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } - encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); + encoder_backward(grads.wte, grads.wpe, scratchX, model->workload_indices, model->bucket_info, + dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state)); } // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. @@ -2448,6 +2575,8 @@ void gpt2_free(GPT2 *model) { cudaCheck(cudaFree(model->inputs)); cudaCheck(cudaFree(model->targets)); cudaFreeHost(model->cpu_losses); + free(model->workload_indices); + free(model->bucket_info); } // ---------------------------------------------------------------------------- @@ -2477,7 +2606,7 @@ void common_free(GPT2 &model) { cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); cublasCheck(cublasLtDestroy(cublaslt_handle)); - create_cudnn(); + destroy_cudnn(); } #ifndef TESTING @@ -2880,7 +3009,7 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward // backward pass. 
all model params accumulate gradients with += inside this inner loop - gpt2_backward(&model); + gpt2_backward(&model, train_loader.inputs); } // override the mean loss, accounting for the gradient accumulation loop // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced From f7cbb303168de1b21aa8d9c8814e3a4a9237fae3 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 15:32:46 +0000 Subject: [PATCH 122/172] step 2 of dataloader refactor: separate out the dataloader to its own file, change its signature a little bit, and (notably) change from cudaMallocHost to a simple malloc, so that this file doesn't have to be cuda aware --- dataloader.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++ train_gpt2.cu | 86 +++--------------------------------------------- 2 files changed, 96 insertions(+), 81 deletions(-) create mode 100644 dataloader.h diff --git a/dataloader.h b/dataloader.h new file mode 100644 index 000000000..4ca0239fa --- /dev/null +++ b/dataloader.h @@ -0,0 +1,91 @@ +/* +Implements a medium simple DataLoader for a distributed training setup. +*/ + +#include +#include +#include +#include +// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck +// defines: mallocCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// Distributed Data Loader + +typedef struct { + // Distributed data parallel specifics. + // Each worker loads it's own chunk of data. + int process_rank; + int num_processes; + // hyperparameters. use size_t to prevent overflow + size_t B; + size_t T; + // input handling and its state + FILE* tokens_file; + long file_size; + long current_position; + // outputs + int* batch; + int* inputs; + int* targets; + // convenience variables + size_t num_batches; +} DataLoader; + +void dataloader_init(DataLoader *loader, + const char* filename, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + + // open the input file for reading + loader->tokens_file = fopenCheck(filename, "rb"); + + // determine the file size + fseekCheck(loader->tokens_file, 0, SEEK_END); + loader->file_size = ftell(loader->tokens_file); + fseekCheck(loader->tokens_file, 0, SEEK_SET); + if (loader->file_size < (B * T + 1) * sizeof(int)) { + printf("Error: file size is too small for the batch size and sequence length\n"); + exit(EXIT_FAILURE); + } + loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning + + // allocate space for B*T + 1 integers to store the inputs and targets + loader->batch = (int*)malloc((B * T + 1) * sizeof(int)); + loader->inputs = loader->batch; + loader->targets = loader->batch + 1; // targets are shifted by one + // note: we definitely want to advance by B * T; That is the "stride" by which we move + // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 + loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); +} + +void dataloader_reset(DataLoader *loader) { + loader->current_position = 0; +} + +void dataloader_next_batch(DataLoader *loader) { + size_t B = loader->B; + size_t T = loader->T; + // if we are at the end of the file, loop back to the beginning + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { + loader->current_position = loader->process_rank * B * T * sizeof(int); + } + // read the B*T+1 integers from the file into batch + fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); + freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); + // advance the current position by B*T*num_processes integers + // note: the "stride" of tokens by which we move each time is definitely B * T + loader->current_position += loader->num_processes * B * T * sizeof(int); +} + +void dataloader_free(DataLoader *loader) { + free(loader->batch); + fcloseCheck(loader->tokens_file); +} diff --git a/train_gpt2.cu b/train_gpt2.cu index 6584a4cd5..f3dfca5c2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -54,6 +54,8 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // CUDA precision settings @@ -2481,85 +2483,7 @@ void common_free(GPT2 &model) { } #ifndef TESTING -// if we are TESTING (see test_gpt2.cu), we'll skip the int main below -// ---------------------------------------------------------------------------- -// data loader lite: returns random batches of data from a file of integers - -typedef struct { - // Distributed data parallel specifics. - // Each worker loads it's own chunk of data. - int process_rank; - int num_processes; - // hyperparameters. 
use size_t to prevent overflow - size_t B; - size_t T; - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - size_t num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const MultiGpuConfig* multi_gpu_config, const char* filename, size_t B, size_t T) { - loader->process_rank = multi_gpu_config->process_rank; - loader->num_processes = multi_gpu_config->num_processes; - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopenCheck(filename, "rb"); - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(EXIT_FAILURE); - } - loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - // Using CUDA CPU pinned memory for faster PCI Express transfers to GPU - // See: https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/ - cudaMallocHost((void**)&loader->batch, (B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - // note: we definitely want to advance by B * T; That is the "stride" by which we move - // the window of tokens. We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - NVTX_RANGE_FN(); - size_t B = loader->B; - size_t T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { - loader->current_position = loader->process_rank * B * T * sizeof(int); - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T*num_processes integers - // note: the "stride" of tokens by which we move each time is definitely B * T - loader->current_position += loader->num_processes * B * T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - cudaFreeHost(loader->batch); -} - +// if we are TESTING (see test_gpt2.cu), we'll skip everything below this point // ---------------------------------------------------------------------------- // sampler: takes probabilities and samples integers from them @@ -2747,8 +2671,8 @@ int main(int argc, char *argv[]) { sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, &multi_gpu_config, train_tokens_filename, B, T); - dataloader_init(&val_loader, &multi_gpu_config, val_tokens_filename, B, T); + dataloader_init(&train_loader, train_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&val_loader, 
val_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf0("| train_num_batches | %-50d |\n", train_num_batches); From a3801f01efae3434d6e4cdbef3dd455fcc10404f Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 16:53:11 +0100 Subject: [PATCH 123/172] added algorithm header for std::sort on windows (not sure about compile time impact...) --- train_gpt2.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/train_gpt2.cu b/train_gpt2.cu index 899293f75..16f8a4216 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -39,6 +39,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include #include #include +#include #include #include // GPU / CUDA related From 1defbd4a19e4fcfe356175a4aac7dd01f6c2e56a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:05:27 +0000 Subject: [PATCH 124/172] adjust the dataloader to load the new .bin data files, and both prod datasets to use it instead --- dataloader.h | 72 +++++++++++++++++++++++++------------ dev/data/data_common.py | 10 ++++-- dev/data/fineweb.py | 4 +-- dev/data/tinyshakespeare.py | 11 ++---- dev/data/tinystories.py | 6 ++-- 5 files changed, 63 insertions(+), 40 deletions(-) diff --git a/dataloader.h b/dataloader.h index 4ca0239fa..fa4e62adc 100644 --- a/dataloader.h +++ b/dataloader.h @@ -12,6 +12,7 @@ Implements a medium simple DataLoader for a distributed training setup. // ---------------------------------------------------------------------------- // Distributed Data Loader +#define HEADER_SIZE 256 typedef struct { // Distributed data parallel specifics. 
@@ -26,13 +27,20 @@ typedef struct { long file_size; long current_position; // outputs - int* batch; - int* inputs; - int* targets; + uint16_t* buffer; // used to fread data from file into + int* inputs; // input tokens into transformer + int* targets; // target tokens for the transformer // convenience variables size_t num_batches; } DataLoader; +void dataloader_reset(DataLoader *loader) { + // each process starts at a different offset in the file + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + void dataloader_init(DataLoader *loader, const char* filename, size_t B, @@ -46,46 +54,64 @@ void dataloader_init(DataLoader *loader, // open the input file for reading loader->tokens_file = fopenCheck(filename, "rb"); + // validate the header + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); + if (header[0] != 20240520) { printf("Bad magic in data file\n"); exit(EXIT_FAILURE); } + if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } + long ntok = header[2]; // number of tokens in the file - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); + // determine the file size and make sure it is consistent with the number of tokens + fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file + loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. file size + fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning + // we expect ntok in the file to be consistent with filesize, assert that is the case + long expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); + if (loader->file_size != expected_file_size) { + printf("Error: file size is not as expected\n"); + exit(EXIT_FAILURE); + } + if (ntok < num_processes * B * T + 1) { + // being too defensive/lazy, we could tolerate as low as T+1 tokens in principle + printf("Error: there are too few tokens\n"); exit(EXIT_FAILURE); } - loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning // allocate space for B*T + 1 integers to store the inputs and targets - loader->batch = (int*)malloc((B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one + loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int*)malloc(B * T * sizeof(int)); + loader->targets = (int*)malloc(B * T * sizeof(int)); // note: we definitely want to advance by B * T; That is the "stride" by which we move // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); -} + loader->num_batches = ntok / (num_processes * B * T); -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; + // reset the loader to the beginning of the file + dataloader_reset(loader); } void dataloader_next_batch(DataLoader *loader) { size_t B = loader->B; size_t T = loader->T; // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { - loader->current_position = loader->process_rank * B * T * sizeof(int); + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_reset(loader); } - // read the B*T+1 integers from the file into batch + // read B*T+1 uint16_t tokens from the file into buffer fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); + freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); + // decode the buffer into inputs and targets (cast to int) + for (int i = 0; i < B*T; i++) { + loader->inputs[i] = (int)loader->buffer[i]; + loader->targets[i] = (int)loader->buffer[i+1]; + } // advance the current position by B*T*num_processes integers // note: the "stride" of tokens by which we move each time is definitely B * T - loader->current_position += loader->num_processes * B * T * sizeof(int); + loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); } void dataloader_free(DataLoader *loader) { - free(loader->batch); + free(loader->buffer); + free(loader->inputs); + free(loader->targets); fcloseCheck(loader->tokens_file); } diff --git a/dev/data/data_common.py b/dev/data/data_common.py index c3147de76..ec85cb90b 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -23,8 +23,12 @@ def download_file(url: str, fname: str, chunk_size=1024): bar.update(size) -def write_shard(filename, toks): - """Saves token data as a .bin file, for reading in C""" +def write_datafile(filename, toks): + """ + Saves token data as a .bin file, for reading in C. 
+ - First comes a header with 256 int32s + - The tokens follow, each as a uint16 + """ assert len(toks) < 2**31, "token count too large" # ~2.1B tokens # construct the header header = np.zeros(256, dtype=np.int32) @@ -37,7 +41,7 @@ def write_shard(filename, toks): # construct the tokens toks_np = np.array(toks, dtype=np.uint16) # write to file - print(f"writing {filename}") + print(f"writing {len(toks):,} tokens to {filename}") with open(filename, "wb") as f: f.write(header.tobytes()) f.write(toks_np.tobytes()) diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 8b6ef4bfa..41091ba6a 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -25,7 +25,7 @@ from tqdm import tqdm import argparse -from data_common import write_shard +from data_common import write_datafile # ------------------------------------------ parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing") @@ -75,7 +75,7 @@ def tokenize(doc): filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") write_tokens = all_tokens[:args.shard_size] rest_tokens = all_tokens[args.shard_size:] - write_shard(filename, write_tokens) + write_datafile(filename, write_tokens) shard_index += 1 progress_bar = None # note: create a copy so Python can free the all_tokens memory above diff --git a/dev/data/tinyshakespeare.py b/dev/data/tinyshakespeare.py index 6d795aef7..6b7cbb976 100644 --- a/dev/data/tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -17,7 +17,7 @@ import os import tiktoken import numpy as np -from data_common import download_file +from data_common import download_file, write_datafile # ----------------------------------------------------------------------------- DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinyshakespeare") @@ -52,13 +52,8 @@ def tokenize(): # save to file val_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_val.bin") train_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_train.bin") - with open(val_filename, "wb") as f: - f.write(val_tokens_np.tobytes()) - with open(train_filename, "wb") as f: - f.write(train_tokens_np.tobytes()) - # prints - print(f"Saved {len(val_tokens_np)} tokens to {val_filename}") - print(f"Saved {len(train_tokens_np)} tokens to {train_filename}") + write_datafile(val_filename, val_tokens_np) + write_datafile(train_filename, train_tokens_np) if __name__ == "__main__": download() diff --git a/dev/data/tinystories.py b/dev/data/tinystories.py index 628e5a7bb..83621e1d8 100644 --- a/dev/data/tinystories.py +++ b/dev/data/tinystories.py @@ -25,7 +25,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed import tiktoken import numpy as np -from data_common import download_file +from data_common import download_file, write_datafile # ----------------------------------------------------------------------------- DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinystories") @@ -96,9 +96,7 @@ def tokenize(): all_tokens_np = np.array(all_tokens, dtype=np.int32) split_filename = os.path.join(DATA_CACHE_DIR, f"TinyStories_{split_name}.bin") - with open(split_filename, "wb") as f: - f.write(all_tokens_np.tobytes()) - print(f"Saved {len(all_tokens_np)} tokens to {split_filename}") + write_datafile(split_filename, all_tokens_np) if __name__ == "__main__": download() From 666145e7b8ef033c4a7eeda6392d759235f64afb Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:30:56 +0000 Subject: [PATCH 125/172] also fix the .c file and fp32 file --- train_gpt2.c | 84 
+++------------------------------------------- train_gpt2_fp32.cu | 78 +++--------------------------------------- 2 files changed, 10 insertions(+), 152 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 41ec3147e..57296736b 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -25,6 +25,8 @@ There will be other versions of this code that specialize it and make it fast. #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // all the individual layers' forward and backward passes @@ -992,81 +994,6 @@ void gpt2_free(GPT2 *model) { #ifndef TESTING // if we are TESTING (see test_gpt2.c), we'll skip the int main below - -// ---------------------------------------------------------------------------- -// data loader lite -// returns random batches of data from a file of integers - -typedef struct { - // hyperparameters - int B; // batch size - int T; // sequence length - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - int num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopen(filename, "rb"); - if (loader->tokens_file == NULL) { - printf("Error opening tokens file\n"); - printf("--> HINT: the data directory may have moved recently from data/ to dev/data/(dataset)/"); - printf("--> HINT: refer again to the README file and possibly re-run the dataset prepro script."); - printf("--> HINT: example: re-run `python dev/data/tinyshakespeare.py`"); - exit(1); - } - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(1); - } - loader->current_position = 0; // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - loader->batch = (int*) mallocCheck((B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - loader->num_batches = loader->file_size / (B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - int B = loader->B; - int T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (B*T+1) * sizeof(int) > loader->file_size) { - loader->current_position = 0; - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T integers - loader->current_position += B*T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - free(loader->batch); -} - // ---------------------------------------------------------------------------- // sampler @@ -1111,11 +1038,10 @@ int main() { const char* val_tokens = 
access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val; int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) int T = 64; // sequence length 64 (i.e. each sequence is 64 tokens long). must be <= maxT, which is 1024 for GPT-2 - DataLoader train_loader; - dataloader_init(&train_loader, train_tokens, B, T); + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_tokens, B, T, 0, 1); + dataloader_init(&val_loader, val_tokens, B, T, 0, 1); printf("train dataset num_batches: %d\n", train_loader.num_batches); - DataLoader val_loader; - dataloader_init(&val_loader, val_tokens, B, T); printf("val dataset num_batches: %d\n", val_loader.num_batches); int val_num_batches = 5; diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index d2cf53b43..9a2dc6bb7 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -31,6 +31,8 @@ the layernorms are connected to the residuals so we += in layernorm backward. #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // CUDA utils @@ -1453,75 +1455,6 @@ void gpt2_free(GPT2 *model) { #ifndef TESTING // if we are TESTING (see test_gpt2.cu), we'll skip the int main below - -// ---------------------------------------------------------------------------- -// data loader lite: returns random batches of data from a file of integers - -typedef struct { - // hyperparameters - int B; - int T; - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - long num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopenCheck(filename, "rb"); - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(EXIT_FAILURE); - } - loader->current_position = 0; // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - // Using CUDA CPU pinned memory for faster PCI Express transfers to GPU - // See: https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/ - cudaMallocHost((void**)&loader->batch, (B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - loader->num_batches = loader->file_size / (B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - int B = loader->B; - int T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (B*T+1) * sizeof(int) > loader->file_size) { - loader->current_position = 0; - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T 
integers - loader->current_position += B*T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - cudaFreeHost(loader->batch); -} - // ---------------------------------------------------------------------------- // sampler: takes probabilities and samples integers from them @@ -1689,10 +1622,9 @@ int main(int argc, char *argv[]) { assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow sprintf(train_tokens_filename, "%s_train.bin", input_dataset_prefix); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); - DataLoader train_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T); - DataLoader val_loader; - dataloader_init(&val_loader, val_tokens_filename, B, T); + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_tokens_filename, B, T, 0, 1); + dataloader_init(&val_loader, val_tokens_filename, B, T, 0, 1); int train_num_batches = train_loader.num_batches; // let's do 1 epoch by default for now int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf("| train_num_batches | %-50d |\n", train_num_batches); From 9bad49a99a47ae777a0a5a211db1294ab3e2bd0d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:46:14 +0000 Subject: [PATCH 126/172] also fix the python file. that should be it now, w.r.t. the new token format .bin files --- train_gpt2.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/train_gpt2.py b/train_gpt2.py index 4d61e68cd..efb64695f 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -502,7 +502,14 @@ def print0(*args, **kwargs): exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: - tokens = np.frombuffer(f.read(), dtype=np.int32) + # first read the header, which is 256 int32 integers (4 bytes each) + header = np.frombuffer(f.read(256*4), dtype=np.int32) + assert header[0] == 20240520, "magic number mismatch, corrupt file?" + assert header[1] == 1, "unsupported version" + ntok = header[2] # number of tokens (claimed) + # the rest of it are tokens, stored as uint16 + tokens = np.frombuffer(f.read(), dtype=np.uint16) + assert len(tokens) == ntok, "number of tokens read does not match header?" 
# np -> tensor, long, on device tokens = torch.tensor(tokens) From d53608820a610c3b338ab90fb5bd2f74ec35281e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:55:56 +0000 Subject: [PATCH 127/172] the write_datafile function accepts python list, which i think is faster but i didn't check --- dev/data/data_common.py | 2 +- dev/data/tinyshakespeare.py | 9 ++++----- dev/data/tinystories.py | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/dev/data/data_common.py b/dev/data/data_common.py index ec85cb90b..8bae1274d 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -38,7 +38,7 @@ def write_datafile(filename, toks): # validate that no token exceeds a uint16 maxtok = 2**16 assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" - # construct the tokens + # construct the tokens numpy array toks_np = np.array(toks, dtype=np.uint16) # write to file print(f"writing {len(toks):,} tokens to {filename}") diff --git a/dev/data/tinyshakespeare.py b/dev/data/tinyshakespeare.py index 6b7cbb976..d9b4b6e22 100644 --- a/dev/data/tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -45,15 +45,14 @@ def tokenize(): text = text.replace('\n\n', '\n\n<|endoftext|>') # encode the text tokens = encode(text) - tokens_np = np.array(tokens, dtype=np.int32) # let's take the first 32,768 tokens as the validation split (~10%) - val_tokens_np = tokens_np[:32768] - train_tokens_np = tokens_np[32768:] + val_tokens = tokens[:32768] + train_tokens = tokens[32768:] # save to file val_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_val.bin") train_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_train.bin") - write_datafile(val_filename, val_tokens_np) - write_datafile(train_filename, train_tokens_np) + write_datafile(val_filename, val_tokens) + write_datafile(train_filename, train_tokens) if __name__ == "__main__": download() diff --git a/dev/data/tinystories.py b/dev/data/tinystories.py index 83621e1d8..fed8bc61c 100644 --- a/dev/data/tinystories.py +++ b/dev/data/tinystories.py @@ -94,9 +94,8 @@ def tokenize(): for future in as_completed(futures): all_tokens.extend(future.result()) - all_tokens_np = np.array(all_tokens, dtype=np.int32) split_filename = os.path.join(DATA_CACHE_DIR, f"TinyStories_{split_name}.bin") - write_datafile(split_filename, all_tokens_np) + write_datafile(split_filename, all_tokens) if __name__ == "__main__": download() From ccc240caab7ee7379a5f1aa4488cbb69e39fa469 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:01:35 +0000 Subject: [PATCH 128/172] make comment more helpful --- dataloader.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dataloader.h b/dataloader.h index fa4e62adc..110276929 100644 --- a/dataloader.h +++ b/dataloader.h @@ -57,7 +57,12 @@ void dataloader_init(DataLoader *loader, // validate the header int header[HEADER_SIZE]; freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); - if (header[0] != 20240520) { printf("Bad magic in data file\n"); exit(EXIT_FAILURE); } + if (header[0] != 20240520) { + printf("Bad magic in the data file\n"); + printf("---> HINT: Are you passing in a correct file?\n"); + printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n"); + exit(EXIT_FAILURE); + } if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } long ntok = header[2]; // number of tokens in the file From f7cb77f3d955143627fb36ffd072436b8c930d52 Mon Sep 17 
00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:20:51 +0000 Subject: [PATCH 129/172] docs on master-breaking change around how we store data .bin files --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index aee282fe0..14cdc8da1 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,10 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.zig](https://github.com/Saimirbaci/llm.zig) by @[saimirbaci](https://github.com/Saimirbaci): a Zig port of this project +## major changes log + +- May 21, 2024: I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. + ## discussions Ways of organizing development: From 7d58fd2abb0672a6074f6e9c87bfff27676048cc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:29:55 +0000 Subject: [PATCH 130/172] adjust py file as well and make the errors better --- train_gpt2.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index efb64695f..b2def106b 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -497,14 +497,19 @@ def print0(*args, **kwargs): # note we're using val by default instead of train split just because it is smaller/faster if not os.path.isfile(args.input_bin): print0(f"ERROR: input .bin file not found: {args.input_bin}") - print0("---> HINT: try to re-run the data prepro script. these recently moved to dev/data") - print0("---> HINT: for example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + print0("---> HINT: Try to re-run the data prepro script. these recently moved to dev/data") + print0("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) - assert header[0] == 20240520, "magic number mismatch, corrupt file?" 
+ if header[0] != 20240520: + print0("ERROR: magic number mismatch in the data .bin file!") + print0("---> HINT: Are you passing in a correct file with --input_bin?") + print0("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") + print0("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + exit(1) assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 From 54ccbd300c49a4421144c448bfe5f83666c4cb9a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:31:19 +0000 Subject: [PATCH 131/172] docs on master-breaking changes around dataset file representation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14cdc8da1..a641afda0 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## major changes log -- May 21, 2024: I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. +- **May 21, 2024: Dataset refactor**. I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. 
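One way to tell whether a particular .bin file has already been re-generated in the new format is to replicate the header check that dataloader.h and train_gpt2.py now perform. The following is only a standalone sketch mirroring those checks (magic 20240520, version 1, token count, expected file size); it is not a tool that ships with the repo:

    // check_bin.c: rough sketch of the header validation done by dataloader_init
    #include <stdio.h>
    #include <stdint.h>

    int main(int argc, char **argv) {
        if (argc < 2) { printf("usage: ./check_bin file.bin\n"); return 1; }
        FILE *f = fopen(argv[1], "rb");
        if (f == NULL) { printf("could not open %s\n", argv[1]); return 1; }
        int32_t header[256];
        if (fread(header, sizeof(int32_t), 256, f) != 256) { printf("short read\n"); return 1; }
        if (header[0] != 20240520) { printf("bad magic -> legacy file? re-run the prepro script\n"); return 1; }
        if (header[1] != 1) { printf("unsupported version %d\n", (int)header[1]); return 1; }
        long ntok = header[2];
        fseek(f, 0, SEEK_END);
        long file_size = ftell(f);
        long expected = 256 * sizeof(int32_t) + ntok * sizeof(uint16_t);
        printf("tokens: %ld, file size: %ld (expected %ld)\n", ntok, file_size, expected);
        fclose(f);
        return (file_size == expected) ? 0 : 1;
    }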
## discussions From 587506d09604ca76415fdf3ac5ab62ca0542d9a8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 19:22:09 +0000 Subject: [PATCH 132/172] torch tensor can't handle uint16 so let's convert to int32, which is silly because we'll convert to .long right after but ok --- train_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_gpt2.py b/train_gpt2.py index b2def106b..f844004d2 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -514,6 +514,8 @@ def print0(*args, **kwargs): ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 tokens = np.frombuffer(f.read(), dtype=np.uint16) + # convert tokens to int32 because torch can't handle uint16 sad + tokens = tokens.astype(np.int32) assert len(tokens) == ntok, "number of tokens read does not match header?" # np -> tensor, long, on device From 967420d1d13109c50318a332a0e5c6cb9fef395a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 19:30:02 +0000 Subject: [PATCH 133/172] fix print format warning for size_t vs int --- train_gpt2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 57296736b..57bdfe929 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1041,8 +1041,8 @@ int main() { DataLoader train_loader, val_loader; dataloader_init(&train_loader, train_tokens, B, T, 0, 1); dataloader_init(&val_loader, val_tokens, B, T, 0, 1); - printf("train dataset num_batches: %d\n", train_loader.num_batches); - printf("val dataset num_batches: %d\n", val_loader.num_batches); + printf("train dataset num_batches: %zu\n", train_loader.num_batches); + printf("val dataset num_batches: %zu\n", val_loader.num_batches); int val_num_batches = 5; // build the Tokenizer From 31310282e164902b7e37dc6883aa7e29666eb9df Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 21:33:24 +0000 Subject: [PATCH 134/172] extend dataloader to be sharded --- dataloader.h | 110 +++++++++++++++++++++++++++++++------------- dev/data/fineweb.py | 3 +- train_gpt2.cu | 29 ++++++------ 3 files changed, 93 insertions(+), 49 deletions(-) diff --git a/dataloader.h b/dataloader.h index 110276929..a9864fbe7 100644 --- a/dataloader.h +++ b/dataloader.h @@ -2,6 +2,7 @@ Implements a medium simple DataLoader for a distributed training setup. 
*/ +#include #include #include #include @@ -23,6 +24,8 @@ typedef struct { size_t B; size_t T; // input handling and its state + glob_t glob_result; // stores the result of glob, for all shards we want to iterate + int current_shard; // the current shard we are reading from FILE* tokens_file; long file_size; long current_position; @@ -34,25 +37,13 @@ typedef struct { size_t num_batches; } DataLoader; -void dataloader_reset(DataLoader *loader) { - // each process starts at a different offset in the file - long header_bytes = HEADER_SIZE * sizeof(int); - long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); - loader->current_position = header_bytes + token_bytes_offset; -} - -void dataloader_init(DataLoader *loader, - const char* filename, - size_t B, - size_t T, - int process_rank, - int num_processes) { - loader->process_rank = process_rank; - loader->num_processes = num_processes; - loader->B = B; - loader->T = T; - - // open the input file for reading +long dataloader_load_shard_(DataLoader *loader, int shard_index) { + // use the first glob match as the filename for now + const char* filename = loader->glob_result.gl_pathv[shard_index]; + // open the input file for reading. also only a single file can be opened at a time + if (loader->tokens_file != NULL) { + fcloseCheck(loader->tokens_file); + } loader->tokens_file = fopenCheck(filename, "rb"); // validate the header int header[HEADER_SIZE]; @@ -65,7 +56,7 @@ void dataloader_init(DataLoader *loader, } if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } long ntok = header[2]; // number of tokens in the file - + assert(ntok > 0); // we expect some tokens in the file. this should never trip, right? // determine the file size and make sure it is consistent with the number of tokens fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. 
file size @@ -76,31 +67,80 @@ void dataloader_init(DataLoader *loader, printf("Error: file size is not as expected\n"); exit(EXIT_FAILURE); } - if (ntok < num_processes * B * T + 1) { - // being too defensive/lazy, we could tolerate as low as T+1 tokens in principle - printf("Error: there are too few tokens\n"); + return ntok; +} + +void dataloader_reset(DataLoader *loader) { + // fully resets the DataLoader object to init configuration + // each process starts at a different offset in the file + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_shard = 0; + loader->current_position = header_bytes + token_bytes_offset; + dataloader_load_shard_(loader, loader->current_shard); +} + +void dataloader_advance_(DataLoader *loader) { + // advance the loader by loading the next data shard and resetting the position + if (loader->glob_result.gl_pathc > 1) { + // if we have more than one shard, advance to the next one + loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc; + dataloader_load_shard_(loader, loader->current_shard); + } + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + +void dataloader_init(DataLoader *loader, + const char* filename_pattern, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + loader->tokens_file = NULL; + + // glob to get the list of files matching the pattern, these are our data shards + int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); + if (glob_status != 0) { + printf("Error: failed to glob pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + if (loader->glob_result.gl_pathc == 0) { + printf("Error: no files found matching the pattern: %s\n", filename_pattern); exit(EXIT_FAILURE); } - // allocate space for B*T + 1 integers to store the inputs and targets + // inspect and validate all shards so we don't get any runtime errors later + // if too slow / too many shards, may wish to revisit later + long ntok_total = 0; + for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) { + long shard_ntok = dataloader_load_shard_(loader, shard_index); + // we need at least one batch/shard, the way things are written right now. + // can be relaxed a lot later. + assert(shard_ntok >= num_processes * B * T + 1); + ntok_total += shard_ntok; + } + printf("DataLoader: filename_pattern: %s\n", filename_pattern); + printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + + // allocate all the space we'll need loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); loader->inputs = (int*)malloc(B * T * sizeof(int)); loader->targets = (int*)malloc(B * T * sizeof(int)); - // note: we definitely want to advance by B * T; That is the "stride" by which we move - // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = ntok / (num_processes * B * T); + loader->num_batches = ntok_total / (num_processes * B * T); // useful to know - // reset the loader to the beginning of the file + // reset the loader, to initialize it dataloader_reset(loader); } void dataloader_next_batch(DataLoader *loader) { size_t B = loader->B; size_t T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { - dataloader_reset(loader); - } // read B*T+1 uint16_t tokens from the file into buffer fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); @@ -111,7 +151,12 @@ void dataloader_next_batch(DataLoader *loader) { } // advance the current position by B*T*num_processes integers // note: the "stride" of tokens by which we move each time is definitely B * T + // we only load B * T + 1 tokens at each iteration because the targets are offset by 1 loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); + // if the next batch would go past the end of the file, advance the loader + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_advance_(loader); + } } void dataloader_free(DataLoader *loader) { @@ -119,4 +164,5 @@ void dataloader_free(DataLoader *loader) { free(loader->inputs); free(loader->targets); fcloseCheck(loader->tokens_file); + globfree(&loader->glob_result); } diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 41091ba6a..8369112a4 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -72,7 +72,8 @@ def tokenize(doc): # if we reach shard_size tokens, write shard to disk if len(all_tokens) >= args.shard_size: - filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") + split = "val" if shard_index == 0 else "train" + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") write_tokens = all_tokens[:args.shard_size] rest_tokens = all_tokens[args.shard_size:] write_datafile(filename, write_tokens) diff --git a/train_gpt2.cu b/train_gpt2.cu index f3dfca5c2..4151b22c5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2541,12 +2541,10 @@ void logger_free(Logger *logger) { // CLI, poor man's argparse void error_usage() { - // default run = debugging run with TinyShakespeare - // bigger run = train on TinyStories! e.g. 
val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = dev/data/tinyshakespeare/tiny_shakespeare)\n"); + fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); + fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); @@ -2572,7 +2570,8 @@ int main(int argc, char *argv[]) { multi_gpu_config = multi_gpu_config_init(&argc, &argv); // read in the (optional) command line arguments - const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories + const char* train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size @@ -2595,7 +2594,8 @@ int main(int argc, char *argv[]) { if (argv[i][0] != '-') { error_usage(); } // must start with dash if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args - if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + if (argv[i][1] == 'i') { train_data_pattern = argv[i+1]; } + else if (argv[i][1] == 'j') { val_data_pattern = argv[i+1]; } else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size @@ -2617,10 +2617,14 @@ int main(int argc, char *argv[]) { } // calculate a sensible default for total batch size by assuming no gradient accumulation if (total_batch_size == -1) { total_batch_size = B * T * multi_gpu_config.num_processes; } + // if we're only overfitting a single batch for debugging, let's overfit the first batch + // from val instead of train split, because val is smaller and faster. (train_gpt2.py does the same) + if (overfit_single_batch == 1) { train_data_pattern = val_data_pattern; } printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| Parameter | Value |\n"); printf0("+-----------------------+----------------------------------------------------+\n"); - printf0("| input dataset prefix | %-50s |\n", input_dataset_prefix); + printf0("| train data pattern | %-50s |\n", train_data_pattern); + printf0("| val data pattern | %-50s |\n", val_data_pattern); printf0("| output log file | %-50s |\n", output_log_file == NULL ? 
"NULL" : output_log_file); printf0("| micro batch size B | %-50d |\n", B); printf0("| sequence length T | %-50d |\n", T); @@ -2663,16 +2667,9 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128], val_tokens_filename[128]; - assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow - // if we're only overfitting a single batch for debugging, let's overfit the first batch - // from val instead of train split, because val is smaller and a bit faster - const char* train_split = (overfit_single_batch == 1) ? "val" : "train"; - sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); - sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); - dataloader_init(&val_loader, val_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf0("| train_num_batches | %-50d |\n", train_num_batches); From 7d0891f6ddebebeefc8a9a5c3f319484aa31f1d5 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 22:37:09 +0100 Subject: [PATCH 135/172] Fully deterministic layernorm (slight perf loss) --- train_gpt2.cu | 110 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..6c60b8a74 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -980,30 +980,34 @@ __global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, s } } -__global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? - layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, +__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads? 
+ layernorm_backward_kernel9(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { + constexpr int BLOCK_SIZE = 512; + constexpr int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block extern __shared__ float shared[]; // size = 2 * C + 1 + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block - int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; int C_per_iteration = WARP_SIZE * x128::size; - int iterations_C = C / C_per_iteration; + int iterations_C = CEIL_DIV(C, C_per_iteration); // the first half of shared memory is bias, second is weight float* dbias_shared = shared; float* dweight_shared = shared + C; + float* dbias_tmp_shared = shared + 2 * C; + float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; // init shared memory to zero - for(int i = threadIdx.x; i < C; i+= blockDim.x){ + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; } - unsigned int *tmp_flag = (unsigned int*)(shared + C*2); + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); __syncthreads(); for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { @@ -1041,6 +1045,10 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with for (int i = 0; i < iterations_C; i++) { int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + x128 dout128 = load128cs(dout_bt + global_index); x128 inp128 = load128cs(inp_bt + global_index); x128 dinp128 = load128(dinp_bt + global_index); @@ -1050,10 +1058,29 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with float dout_i = (float)dout128[x]; float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; float dnorm_i = (float)weight128[x] * dout_i; - // gradient contribution to bias (using shared memory friendly index) - atomicAdd(&dbias_shared[shared_index + x*WARP_SIZE], dout_i); - // gradient contribution to weight (using shared memory friendly index) - atomicAdd(&dweight_shared[shared_index + x*WARP_SIZE], norm_bti * dout_i); + + // sum up the gradients for bias and weight across the entire block + // this is basically a reduction (but only inter-warp, not intra-warp) + // doing it this way allows us to avoid using atomics while using many warps + if (warpId != 0) { + dbias_tmp_shared[threadIdx.x] = dout_i; + dweight_tmp_shared[threadIdx.x] = norm_bti * dout_i; + } + __syncthreads(); + if (warpId == 0) { + float dbias_tmp = dout_i; + float dweight_tmp = norm_bti * dout_i; + for (int j = 1; j < warpsInBlock; j++) { + dbias_tmp += dbias_tmp_shared[threadIdx.x + j * WARP_SIZE]; + dweight_tmp += dweight_tmp_shared[threadIdx.x + j * WARP_SIZE]; + } + // gradient contribution to bias (using shared memory friendly index) + dbias_shared[shared_index + x*WARP_SIZE] += dbias_tmp; + // gradient contribution to weight (using shared memory friendly index) + dweight_shared[shared_index + x*WARP_SIZE] += dweight_tmp; + } + __syncthreads(); + // gradient contribution to input float dval = 0.0f; dval += dnorm_i; // term 1 @@ -1066,35 +1093,64 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with 
store128cg(dinp_bt + global_index, dinp128); } } - // Accumulate into a FP32 scratchpad - // BF16 atomics are potentially much slower... and this is more precise! - // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; - unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); - for(int i = threadIdx.x; i < C; i+= blockDim.x) { - // global atomics in the same "shared memory banking friendly" order - atomicAdd(&scratch_dbias[i], dbias_shared[i]); - atomicAdd(&scratch_dweight[i], dweight_shared[i]); + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + // Write to global memory in the same "shared memory banking friendly" order + scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; + scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; } + __syncthreads(); if (threadIdx.x == 0) { *tmp_flag = atomicInc(scratchFlag, gridDim.x); } __syncthreads(); if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! + for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + f128 dbias_accum(make_int4(0, 0, 0, 0)); + f128 dweight_accum(make_int4(0, 0, 0, 0)); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + // this is separate also because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? 
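    // note on determinism: with the previous atomicAdd-based accumulation, the order in which
    // blocks and threads added into dbias/dweight varied from run to run, and float addition
    // is not associative, so results could differ in the last bits. Here every block writes its
    // partial sums into its own slice of scratch (offset by 2*C*blockIdx.x), and only the last
    // block to arrive (detected via atomicInc on scratchFlag) adds the partials together in a
    // fixed block order, so repeated runs produce bitwise-identical gradients.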
for (int i = warpId; i < iterations_C; i += warpsInBlock) { - // reorder from atomic/shared memory-friendly index to real global memory index - // and convert from float/FP32 to floatX/BF16 for the final write int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } x128 dbias128 = load128(dbias + global_index); x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - float s_db = scratch_dbias[shared_index + x*WARP_SIZE]; - float s_dw = scratch_dweight[shared_index + x*WARP_SIZE]; + float s_db = dbias_shared[shared_index + x*WARP_SIZE]; + float s_dw = dweight_shared[shared_index + x*WARP_SIZE]; dbias128[x] = (floatX)(s_db + (float)dbias128[x]); dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } @@ -1603,15 +1659,13 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { NVTX_RANGE_FN(); - // todo - forcing 3 x 512 threads per SM maximum is a bit hacky, but more than that results in - // cache thrashing and lower performance on A100... is there a better way? const int block_size = 512; - const int blocks_per_sm = min(3, (deviceProp.maxThreadsPerMultiProcessor / 1024)); + const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3 const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; - size_t shared_mem_size = (2 * C + 1) * sizeof(float); + size_t shared_mem_size = (2*C + 2*block_size + 1) * sizeof(float); // see kernel - cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); - layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + cudaMemset(scratch, 0, 1 * sizeof(float)); // only need to reset the flag to 0 + layernorm_backward_kernel9<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } From 7cbeefc7f371412bbaca2990abbf5873bb8547ae Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 23:26:54 +0100 Subject: [PATCH 136/172] added new layernorm backward to /dev/cuda/ --- dev/cuda/layernorm_backward.cu | 199 ++++++++++++++++++++++++++++++++- train_gpt2.cu | 20 ++-- 2 files changed, 206 insertions(+), 13 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 90dcb1674..d9502880b 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -856,6 +856,185 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) } } +__global__ void layernorm_backward_kernel9(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { + constexpr int WARP_SIZE = 32; + int BLOCK_SIZE = blockDim.x; + int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block + extern __shared__ float shared[]; // size = 2 * C + 1 + + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = WARP_SIZE * x128::size; + int iterations_C = ceil_div(C, C_per_iteration) + 2; + + // the first half of shared memory is bias, second is weight + float* 
dbias_shared = shared; + float* dweight_shared = shared + C; + float* dbias_tmp_shared = shared + 2 * C; + float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; + + // init shared memory to zero + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ + dbias_shared[i] = 0.0f; + dweight_shared[i] = 0.0f; + } + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); + __syncthreads(); + + for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { + int b = idx / T; + int t = idx % T; + + const floatX* dout_bt = dout + b * T * C + t * C; + const floatX* inp_bt = inp + b * T * C + t * C; + floatX* dinp_bt = dinp + b * T * C + t * C; + const float mean_bt = (float)mean[b * T + t]; + const float rstd_bt = (float)rstd[b * T + t]; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = ((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } + } + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; + + // now iterate again and accumulate all the gradients + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + + // sum up the gradients for bias and weight across the entire block + // this is basically a reduction (but only inter-warp, not intra-warp) + // doing it this way allows us to avoid using atomics while using many warps + if (warpId != 0) { + dbias_tmp_shared[threadIdx.x] = dout_i; + dweight_tmp_shared[threadIdx.x] = norm_bti * dout_i; + } + __syncthreads(); + if (warpId == 0) { + float dbias_tmp = dout_i; + float dweight_tmp = norm_bti * dout_i; + for (int j = 1; j < warpsInBlock; j++) { + dbias_tmp += dbias_tmp_shared[threadIdx.x + j * WARP_SIZE]; + dweight_tmp += dweight_tmp_shared[threadIdx.x + j * WARP_SIZE]; + } + // gradient contribution to bias (using shared memory friendly index) + dbias_shared[shared_index + x*WARP_SIZE] += dbias_tmp; + // gradient contribution to weight (using shared memory friendly index) + dweight_shared[shared_index + x*WARP_SIZE] += dweight_tmp; + } + __syncthreads(); + + // gradient contribution to input + float dval = 0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } 
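    // the dval update above is the standard layernorm input gradient:
    //   dx_i = rstd * (dnorm_i - mean_j(dnorm_j) - norm_i * mean_j(dnorm_j * norm_j))
    // where dnorm_i = weight_i * dout_i and norm_i = (x_i - mean) * rstd;
    // dnorm_mean and dnorm_norm_mean computed earlier hold the two means (already divided by C).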
+ // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + // Write to global memory in the same "shared memory banking friendly" order + scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; + scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; + } + __syncthreads(); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! + for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + f128 dbias_accum(make_int4(0, 0, 0, 0)); + f128 dweight_accum(make_int4(0, 0, 0, 0)); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + // this is separate also because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? 
+ for (int i = warpId; i < iterations_C; i += warpsInBlock) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); + for (int x = 0; x < x128::size; x++) { + float s_db = dbias_shared[shared_index + x*WARP_SIZE]; + float s_dw = dweight_shared[shared_index + x*WARP_SIZE]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + // ---------------------------------------------------------------------------- // kernel launchers @@ -947,6 +1126,18 @@ void layernorm_backward8(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* s layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); } +template +void layernorm_backward9(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* scratch, + const Tdout* dout, const Trest* inp, const Tparams* weight, const Trest* mean, const Trest* rstd, + int B, int T, int C, int block_size) { + + const int grid_size = (1024/block_size) * cuda_num_SMs; // todo - heuristics for other GPUs? + size_t shared_mem_size = (2 * C + 2 * block_size + 1) * sizeof(float); + + cudaMemset(scratch, 0, 1 * sizeof(float)); // just need to memset the flag for this version + layernorm_backward_kernel9<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); +} + // kernel version dispatch void layernorm_backward(int kernel_num, floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -982,6 +1173,9 @@ void layernorm_backward(int kernel_num, case 8: layernorm_backward8(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); break; + case 9: + layernorm_backward9(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -1042,7 +1236,7 @@ int main(int argc, char **argv) { cudaCheck(cudaMalloc(&d_weight, C * sizeof(floatX))); cudaCheck(cudaMalloc(&d_mean, B * T * sizeof(floatX))); cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(floatX))); - cudaCheck(cudaMalloc(&d_scratch, cuda_num_SMs * (2 * C + 1) * sizeof(float))); + cudaCheck(cudaMalloc(&d_scratch, (1024/32) * cuda_num_SMs * (2 * C + 1) * sizeof(float))); // copy over the "inputs" to the backward call cudaCheck(memcpy_convert(d_dout, dout, B * T * C)); cudaCheck(memcpy_convert(d_inp, inp, B * T * C)); @@ -1051,7 +1245,8 @@ int main(int argc, char **argv) { cudaCheck(memcpy_convert(d_rstd, rstd, B * T)); // launch the kernel - int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; + // removed 768 because it doesn't work for kernel9 despite being OK in train_gpt2.cu?! 
+ int block_sizes[] = {32, 64, 128, 256, 512, /*768,*/ 1024}; for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; // init the "outputs" of the backward call to zeros diff --git a/train_gpt2.cu b/train_gpt2.cu index 6c60b8a74..31b6db2b7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -985,10 +985,8 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { - constexpr int BLOCK_SIZE = 512; - constexpr int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block - extern __shared__ float shared[]; // size = 2 * C + 1 - + extern __shared__ float shared[]; // size = 2*C + 2*block_size + 1 + int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int warpId = threadIdx.x / WARP_SIZE; // warp index within a block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp @@ -1000,14 +998,14 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with float* dbias_shared = shared; float* dweight_shared = shared + C; float* dbias_tmp_shared = shared + 2 * C; - float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; + float* dweight_tmp_shared = shared + 2 * C + blockDim.x; // init shared memory to zero - for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ + for(int i = threadIdx.x; i < C; i+= blockDim.x){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; } - unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*blockDim.x); __syncthreads(); for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { @@ -1102,12 +1100,14 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with scratch += 32; float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; - for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + for(int i = threadIdx.x; i < C; i+= blockDim.x) { // Write to global memory in the same "shared memory banking friendly" order scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; } + // todo - everything below could become a separate kernel for better performance with maybe less code + // not enough parallelism even inside that single SM... do we need another level of reduction?! __syncthreads(); if (threadIdx.x == 0) { *tmp_flag = atomicInc(scratchFlag, gridDim.x); @@ -1115,9 +1115,7 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with __syncthreads(); if (*tmp_flag == gridDim.x-1) { // Reduction of the partial sums by the final block - // todo - there isn't enough parallelism even inside that single SM... - // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! 
- for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + for(int i = threadIdx.x * f128::size; i < C; i+= blockDim.x * f128::size) { f128 dbias_accum(make_int4(0, 0, 0, 0)); f128 dweight_accum(make_int4(0, 0, 0, 0)); From edb0df967a6b3a3dfa7f7e3b440bdf3c2a4d7d7e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 11:59:09 +0000 Subject: [PATCH 137/172] continued changes for sharded dataloader --- dataloader.h | 16 +++++++-------- dev/data/fineweb.py | 50 +++++++++++++++++++++++++-------------------- train_gpt2.cu | 2 +- train_gpt2_fp32.cu | 24 +++++++++------------- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/dataloader.h b/dataloader.h index a9864fbe7..93019317f 100644 --- a/dataloader.h +++ b/dataloader.h @@ -16,8 +16,8 @@ Implements a medium simple DataLoader for a distributed training setup. #define HEADER_SIZE 256 typedef struct { - // Distributed data parallel specifics. - // Each worker loads it's own chunk of data. + // variables related to distributed training + // each process/worker has to access different parts of the data int process_rank; int num_processes; // hyperparameters. use size_t to prevent overflow @@ -29,12 +29,11 @@ typedef struct { FILE* tokens_file; long file_size; long current_position; - // outputs - uint16_t* buffer; // used to fread data from file into + uint16_t* buffer; // we fread data from file into this buffer + // public variables that could be accessed from outside + size_t num_batches; int* inputs; // input tokens into transformer int* targets; // target tokens for the transformer - // convenience variables - size_t num_batches; } DataLoader; long dataloader_load_shard_(DataLoader *loader, int shard_index) { @@ -125,8 +124,9 @@ void dataloader_init(DataLoader *loader, assert(shard_ntok >= num_processes * B * T + 1); ntok_total += shard_ntok; } - printf("DataLoader: filename_pattern: %s\n", filename_pattern); - printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + // debugging prints + // printf("DataLoader: filename_pattern: %s\n", filename_pattern); + // printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); // allocate all the space we'll need loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 8369112a4..9b8863dac 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -55,30 +55,36 @@ def tokenize(doc): return enc.encode_ordinary(doc["text"]) # main loop write files -pool = mp.Pool() -shard_index = 0 -all_tokens = [] -progress_bar = None -for tokens in pool.imap(tokenize, fw): +with mp.Pool() as pool: + shard_index = 0 + all_tokens = [] + progress_bar = None + for tokens in pool.imap(tokenize, fw): - # record the tokens and make sure to separate documents - all_tokens.append(eot) - all_tokens.extend(tokens) + # record the tokens and make sure to separate documents + all_tokens.append(eot) + all_tokens.extend(tokens) - # update progress bar - if progress_bar is None: - progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") - progress_bar.update(len(tokens)) + # update progress bar + if progress_bar is None: + progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") + progress_bar.update(len(tokens)) - # if we reach shard_size tokens, write shard to disk - if len(all_tokens) >= args.shard_size: + # if we reach shard_size tokens, write shard to disk + 
if len(all_tokens) >= args.shard_size: + split = "val" if shard_index == 0 else "train" + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") + write_tokens = all_tokens[:args.shard_size] + rest_tokens = all_tokens[args.shard_size:] + write_datafile(filename, write_tokens) + shard_index += 1 + progress_bar = None + # note: create a copy so Python can free the all_tokens memory above + # the list rest_tokens is expected to be very small + all_tokens = [t for t in rest_tokens] + + # write any remaining tokens as the last shard + if len(all_tokens) > 0: split = "val" if shard_index == 0 else "train" filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") - write_tokens = all_tokens[:args.shard_size] - rest_tokens = all_tokens[args.shard_size:] - write_datafile(filename, write_tokens) - shard_index += 1 - progress_bar = None - # note: create a copy so Python can free the all_tokens memory above - # the list rest_tokens is expected to be very small - all_tokens = [t for t in rest_tokens] + write_datafile(filename, all_tokens) diff --git a/train_gpt2.cu b/train_gpt2.cu index 4151b22c5..578e7c80d 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2545,7 +2545,7 @@ void error_usage() { fprintf(stderr, "Options:\n"); fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); - fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); + fprintf(stderr, " -e input from model at this filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index 9a2dc6bb7..57697bc2f 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1525,12 +1525,10 @@ void logger_free(Logger *logger) { // CLI, poor man's argparse void error_usage() { - // default run = debugging run with TinyShakespeare - // bigger run = train on TinyStories! e.g. val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2fp32cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2fp32cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); + fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); @@ -1547,7 +1545,8 @@ void error_usage() { int main(int argc, char *argv[]) { // read in the (optional) command line arguments - const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. 
data/TinyStories + const char* train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max @@ -1561,7 +1560,8 @@ int main(int argc, char *argv[]) { if (argv[i][0] != '-') { error_usage(); } // must start with dash if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args - if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + if (argv[i][1] == 'i') { train_data_pattern = argv[i+1]; } + else if (argv[i][1] == 'j') { val_data_pattern = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } @@ -1575,7 +1575,8 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); printf("| Parameter | Value |\n"); printf("+-----------------------+----------------------------------------------------+\n"); - printf("| input dataset prefix | %-50s |\n", input_dataset_prefix); + printf("| train data pattern | %-50s |\n", train_data_pattern); + printf("| val data pattern | %-50s |\n", val_data_pattern); printf("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); printf("| batch size B | %-50d |\n", B); printf("| sequence length T | %-50d |\n", T); @@ -1617,14 +1618,9 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128]; - char val_tokens_filename[128]; - assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow - sprintf(train_tokens_filename, "%s_train.bin", input_dataset_prefix); - sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T, 0, 1); - dataloader_init(&val_loader, val_tokens_filename, B, T, 0, 1); + dataloader_init(&train_loader, train_data_pattern, B, T, 0, 1); + dataloader_init(&val_loader, val_data_pattern, B, T, 0, 1); int train_num_batches = train_loader.num_batches; // let's do 1 epoch by default for now int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf("| train_num_batches | %-50d |\n", train_num_batches); From 05be4f6d825d3b3de813e8468c21d663a5755f03 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 12:32:25 +0000 Subject: [PATCH 138/172] readme changes --- README.md | 104 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index a641afda0..f77870b25 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ make train_gpt2fp32cu ./train_gpt2fp32cu ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. 
Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. Below in the CUDA section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which is still being very actively developed, uses mixed precision, and runs ~2X faster. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. In the next section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which uses mixed precision, and runs ~2X faster. ## quick start (GPU, fast bleeding edge) @@ -45,75 +45,102 @@ Note that the default batch size is very low (4). If you have enough memory on y ./train_gpt2cu -b 32 ``` -My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: +My standard single-GPU "prod" run (e.g. with a A100 40GB) trains on TinyStories instead of TinyShakespeare and looks like this, as an example: ```bash python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i dev/data/tinystories/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories_train.bin \ + -j dev/data/tinystories/TinyStories_val.bin \ + -v 250 -s 250 -g 144 -o stories.log -b 32 ``` -Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. +The `-i` flag is a glob pattern for the input data, `-j` for the val data. In addition I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. -## quick start (CPU) - -The "I am so GPU poor that I don't even have one" section. No worries, run: +If you want to train on actual, real pretraining data, check out the recently added support for [fineweb dataset](https://huggingface.co/datasets/HuggingFaceFW/fineweb). Unlike the datasets above where the train/val tokens fit into a single .bin file, we now have multiple data shards as well. Here is an example: -```bash -pip install -r requirements.txt -python dev/data/tinyshakespeare.py -python train_gpt2.py -make train_gpt2 -OMP_NUM_THREADS=8 ./train_gpt2 +``` +# write fineweb data in 100M token shards to dev/data/fineweb10B +python dev/data/fineweb.py -s 100000000 +# compile and run +./train_gpt2cu -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -v 250 -s 250 -g 144 -o fineweb.log -b 32 ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. 
Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. +Where you will notice the use of glob pattern `*` to match all the train shards. ## quick start (multiple GPUs) -You'll be using the (more bleeding edge) mixed precision version of the code: +Great, let's get even more serious. We're using MPI and NCCL for multi-GPU training. Everything in the section above applies, with the following changes: ```bash +# example to install MPI: sudo apt install openmpi-bin openmpi-doc libopenmpi-dev +# the run command is now preceeded by `mpirun`: +mpirun -np ./train_gpt2cu +``` + +Sub in the number of GPUs you'd like to run on in the last command. All of the flags discussed in the section above apply here as well. + +## quick start (CPU) + +The "I am so GPU poor that I don't even have one" section. You can still train! But you won't go too far. You can still finetune a GPT-2 small (124M parameter model) to output Shakespeare-like text, as an example: + +```bash pip install -r requirements.txt python dev/data/tinyshakespeare.py python train_gpt2.py -make train_gpt2cu -mpirun -np ./train_gpt2cu +make train_gpt2 +OMP_NUM_THREADS=8 ./train_gpt2 ``` -Sub in the number of GPUs you'd like to run on in the last command. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. ## training: more detail -Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: +The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to file. So for example when you run: ```bash python dev/data/tinyshakespeare.py ``` -This prints: +We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. The output of this looks like this: ``` -Saved 32768 tokens to (...)/tiny_shakespeare_val.bin -Saved 305260 tokens to (...)/tiny_shakespeare_train.bin +writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin +writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `tinystories.py`. +The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. -In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. 
Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: +In principle, once we have the tokens, we'd be ready to train the model right here. However, current code can't start training from scratch just yet (coming very soon), so we initialize training from the pretrained models released by OpenAI and do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C. This is what happens when you run this script: ```bash python train_gpt2.py ``` -You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. We can now initialize with these model weights and continue training in raw C. First compile the code: +You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. The file also saves both the fp32 versions of the above, and the bfloat16 versions of them for mixed precision training. We can now initialize with these model weights and continue training in raw C. Then we compile the training programs with `make`. There are currently three parallel implementations: ```bash +# the simple, CPU, reference code version make train_gpt2 +# the single-GPU fp32 CUDA version +make train_gpt2fp32cu +# the multi-GPU mixed precision CUDA version +make train_gpt2cu ``` -You can have a look inside the `Makefile` and its comments. It will try to autodetect if OpenMP is available on your system, which is very helpful for speeding up the code at very low cost of code complexity. Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: +You can have a look inside the `Makefile` and its comments. It will try to autodetect a lot of tools and libraries (e.g. cuDNN, OpenMP, OpenMPI, nvcc), and you want to get as many checkmarks as possible. 
For example when I run `make train_gpt2cu USE_CUDNN=1` on my fully configured machine, we see: + +``` +✓ cuDNN found, will run with flash-attention +✓ OpenMP found +✓ OpenMPI found, OK to train with multiple GPUs +✓ nvcc found, including GPU/CUDA support +``` + +Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: ``` # try this first @@ -122,7 +149,7 @@ CFLAGS="-Ofast -fno-finite-math-only -Wno-unused-result -march=native" make trai CFLAGS="-O3 -Wno-unused-result -march=native" make train_gpt2 ``` -Once `train_gpt2` is compiled, you can run it: +Once the binary is compiled, we can run it. For example the simplest CPU reference version runs as: ```bash OMP_NUM_THREADS=8 ./train_gpt2 @@ -164,18 +191,27 @@ Allay --- ``` -I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. Also note that if you are seeing token ids instead of text in the generation, it might be because your code is out of date, as Tokenizer decoding was added April 14, 2024. `git pull` the updates, and then re-run `python train_gpt2.py`, which will now also save the tokenizer, which C can read and then use to print text instead of token ids. +I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. + +Finally, the code is in flux. If anything weird happens that you didn't expect or that worked previously, try to `git pull`, re-run all the commands above, reference back to this README, etc. ## test -I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with: +I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. On the CPU as an example, compile and run with: ```bash make test_gpt2 ./test_gpt2 ``` -This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. +This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. To test the GPU version I run: + +```bash +# fp32 test (cudnn not supported) +make test_gpt2cu PRECISION=FP32 && ./test_gpt2cu +# mixed precision cudnn test +make test_gpt2cu USE_CUDNN=1 && ./test_gpt2cu +``` ## tutorial @@ -183,7 +219,7 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layer ## CUDA -The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we roughly match the speed of PyTorch. 
The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. +The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we slightly exceed the speed of PyTorch Nightly. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. **WIP alert, April 23**. We merged the first version of mixed precision training code. I checkpointed the fp32 version to separate files that include `_fp32` in their filename, and would like to preserve this version in the root of the repo because it 1) doesn't require the most up to date CUDA and will a lot more likely compile and is more portable, 2) it is a lot simpler and acts as reference. In fact, we'd like to diverge the fp32 version in the direction of being pure CUDA (e.g. do not even call cuBLAS by default), to be used as an educational reference, maybe even a kernel of a course on CUDA. The "mainline" development concerned with speed will from there on move to the [train_gpt2.cu](train_gpt2.cu) file, which includes mixed precision training. @@ -198,7 +234,7 @@ make test_gpt2fp32cu This prints `overall okay: 1`. So the forward activations, backward gradients, and the individual loss values for 10 iterations all match exactly. -**Training**. To train GPT-2 in a single file of CUDA, run the train script: +**Training**. To train on single GPU in fp32: ```bash make train_gpt2fp32cu @@ -228,9 +264,7 @@ For on his rock shall he be opencast. Keep on with me, my ``` -This runs on my A100 in about ~10 seconds. This training loop in the PyTorch script is about 80ms/iteration, so we are slightly better than PyTorch here. However, this is measured with PyTorch that is a bit stale (I'm on 2.1.0) and we're not yet including FlashAttention or the PyTorch scaled_dot_product_attention fused operation. - -We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: +This runs on my A100 in about ~10 seconds. We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: ```bash python train_gpt2.py --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1 From 099d30f8140eca899d07e00c4708c69bc36b9261 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 13:08:18 +0000 Subject: [PATCH 139/172] add a super small crappy glob for windows that only matches a single unique file. this will make CI happy but we can't train on sharded data on windows until this is improved --- dataloader.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/dataloader.h b/dataloader.h index 93019317f..d04fc03ed 100644 --- a/dataloader.h +++ b/dataloader.h @@ -2,15 +2,44 @@ Implements a medium simple DataLoader for a distributed training setup. 
*/ -#include #include #include #include #include +#include +#include // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck // defines: mallocCheck #include "utils.h" +// ---------------------------------------------------------------------------- +// we need glob to list files matching a pattern +// windows does not have glob, so we fall back on a very simple implementation +// this implementation doesn't actually do a glob, it assumes that the "pattern" +// is exactly the single file of interest +#ifndef _WIN32 +#include +#else + +typedef struct glob_t { + size_t gl_pathc; + char **gl_pathv; +} glob_t; + +int glob(const char *pattern, int flags, void *unused, glob_t *pglob) { + assert(strstr(pattern, "*") == NULL); // we don't support * here + pglob->gl_pathc = 1; + pglob->gl_pathv = (char **)malloc(sizeof(char *)); + if (pglob->gl_pathv == NULL) { exit(EXIT_FAILURE); } // ??? oom? + pglob->gl_pathv[0] = (char *)pattern; + return 0; +} + +void globfree(glob_t* pglob) { + free(pglob->gl_pathv); +} +#endif + // ---------------------------------------------------------------------------- // Distributed Data Loader #define HEADER_SIZE 256 From 051f3ca53c4e0541a2a81ba09f7c0f96771ad9d6 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 19:24:20 +0000 Subject: [PATCH 140/172] first draft, apparently this works. needs cleanups, and also we are not yet utilizing the full batch dimension. we actually have to load in multiple examples and fully utilize batch --- .gitignore | 2 +- dataloader.h | 170 ++++++++++++++++++++++++++++++++++++++++ dev/data/data_common.py | 61 ++++++++++++++ dev/data/hellaswag.py | 33 ++++++-- train_gpt2.cu | 63 +++++++++++++-- 5 files changed, 317 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 05391b6d1..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb/ +dev/data/fineweb10B/ dev/data/hellaswag/ dev/data/mmlu/ dev/data/tinyshakespeare/ diff --git a/dataloader.h b/dataloader.h index d04fc03ed..72055d8db 100644 --- a/dataloader.h +++ b/dataloader.h @@ -195,3 +195,173 @@ void dataloader_free(DataLoader *loader) { fcloseCheck(loader->tokens_file); globfree(&loader->glob_result); } + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like) HellaSwag and MMLU are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve these style of evals +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - delimiter of 2**16-1, i.e. 65,535 + - , bytes encoding this example, allowing efficient skip to next + - , the index of the example in the dataset + -