From 3254a51522a339e8a1e8c18acf9ec557c20950b7 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:07:01 +0000 Subject: [PATCH 001/172] Zero Optimizations configs --- train_gpt2.cu | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 845148016..9f05c37ff 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -337,6 +337,16 @@ typedef struct { int process_rank; // Rank of this process among all MPI processes. 0 if no multi-GPU. int num_processes; // Total number of processes. 1 if no multi-GPU. int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. + + // Zero optimization stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // 0-Disabled + // 1-Optimizer State Sharding (OSS) + // 2-Optimizer + Gradient State Sharding (SDP) + // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) + int zero_stage; + bool zero_active; + size_t shard_num_parameters; + size_t shard_offset; #ifdef MULTI_GPU ncclComm_t nccl_comm; // NCCL communication primitive, used for collective multi-GPU work. #endif @@ -1905,7 +1915,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); @@ -2087,6 +2097,7 @@ void error_usage() { fprintf(stderr, " -m val_max_batches, up to how many val batches to estimate val loss? (default = 20)\n"); fprintf(stderr, " -s sample_every, how often we inference the model (default = 20)\n"); fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n"); + fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); exit(EXIT_FAILURE); } @@ -2105,6 +2116,7 @@ int main(int argc, char *argv[]) { int val_max_batches = 20; // how many batches max do we eval for validation loss? int sample_every = 20; // every how many steps to do inference? 
int genT = 64; // number of steps of inference we will do + int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2119,6 +2131,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'm') { val_max_batches = atoi(argv[i+1]); } else if (argv[i][1] == 's') { sample_every = atoi(argv[i+1]); } else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); } + else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else { error_usage(); } } printf0("+-----------------------+----------------------------------------------------+\n"); @@ -2190,7 +2203,9 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // pretty print in a table the multi-gpu configuration as well + set_zero_configs(&multi_gpu_config, zero_stage, model.num_parameters); printf0("| num_processes | %-50d |\n", multi_gpu_config.num_processes); + printf0("| zero_stage | %-50d |\n", multi_gpu_config.zero_stage); printf0("+-----------------------+----------------------------------------------------+\n"); // more prints related to allocations from gpt2_build_from_checkpoint down here to not mess up our table above From 2d26ec10b2b4439e4db12bd49b2e765a377982d4 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:08:25 +0000 Subject: [PATCH 002/172] setting the zero opt configs --- train_gpt2.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/train_gpt2.cu b/train_gpt2.cu index 9f05c37ff..c835b6976 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -439,6 +439,36 @@ void printf0(const char *format, ...) 
{ } } +void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t total_parameters) { + + multi_gpu_config->zero_stage = 0; + multi_gpu_config->zero_active = false; + multi_gpu_config->shard_num_parameters = total_parameters; + multi_gpu_config->shard_offset = 0; + +#ifdef MULTI_GPU + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % multi_gpu_config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + } + else { + printf0("| Zero Stage1 is enabled |\n"); + multi_gpu_config->zero_stage = 1; + multi_gpu_config->zero_active = true; + multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; + multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + } +#endif +} + // ---------------------------------------------------------------------------- // all the kernels From b3e8abdd54921d0aad8ebd75458153806fd6d53a Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:09:52 +0000 Subject: [PATCH 003/172] optimizer update per shard and nccl all gather --- train_gpt2.cu | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index c835b6976..586656a50 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1943,28 +1943,39 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - + size_t num_parameters = multi_gpu_config->shard_num_parameters; + size_t offset = multi_gpu_config->shard_offset; + floatX* params_memory = (floatX*)model->params_memory + offset; + floatX* grads_memory = (floatX*)model->grads_memory + offset; + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); } int block_size = 512; - int num_blocks = 
CEIL_DIV(model->num_parameters, block_size); + int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, (floatX*)model->grads_memory, model->m_memory, model->v_memory, - model->num_parameters, + adamw_kernel3<<>>(params_memory, grads_memory, model->m_memory, model->v_memory, + num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); + + if (multi_gpu_config->zero_active) { + // gather all parameter updates from each process + ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, + num_parameters, ncclFloatX, + multi_gpu_config->nccl_comm, 0)); // using default stream + } } void gpt2_free(GPT2 *model) { From 1f442fdf7f8720227d8e898295a577f0d4e15853 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Tue, 30 Apr 2024 09:10:50 +0000 Subject: [PATCH 004/172] fix gpt2_update call --- train_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 586656a50..a958a24eb 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2346,7 +2346,7 @@ int main(int argc, char *argv[]) { if (multi_gpu_config.num_processes > 1) { gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); } - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings clock_gettime(CLOCK_MONOTONIC, &end); double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; From d35d2e606bc8003a08b9fb5c873e3d3f7c0a3fde Mon Sep 17 00:00:00 2001 From: chinthysl Date: Thu, 2 May 2024 07:31:34 +0000 Subject: [PATCH 005/172] Generalized copy_and_cast_kernel and changes to cater model->master_weights --- train_gpt2.cu | 56 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 6b2e59456..df44d78f9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -358,7 +358,7 @@ typedef struct { int num_processes; // Total number of processes. 1 if no multi-GPU. int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. - // Zero optimization stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html // 0-Disabled // 1-Optimizer State Sharding (OSS) // 2-Optimizer + Gradient State Sharding (SDP) @@ -1325,10 +1325,37 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* } } -__global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { - // a small kernel to copy and cast, i.e. 
`dst <- (float) src` - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] = (float)src[i]; } +// device functions and the kernel to cast data between types +template +__device__ Td cast_value(Ts val); + +template<> +__device__ float cast_value(half val) { + return __half2float(val); +} + +template<> +__device__ half cast_value(float val) { + return __float2half(val); +} + +template<> +__device__ __nv_bfloat16 cast_value<__nv_bfloat16, float>(float val) { + return __float2bfloat16(val); +} + +template<> +__device__ float cast_value(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // need to try grid stride looping for more perf later + if (idx < n) { + dst[idx] = cast_value(src[idx]); + } } // ---------------------------------------------------------------------------- @@ -2282,10 +2309,6 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo size_t offset = multi_gpu_config->shard_offset; floatX* params_memory = (floatX*)model->params_memory + offset; floatX* grads_memory = (floatX*)model->grads_memory + offset; - float* master_params = NULL; - if (model->use_master_weights == 1) { - master_weights = model->master_weights + offset; - } // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { @@ -2304,6 +2327,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + float* master_weights = NULL; + if (model->use_master_weights == 1) { + master_weights = model->master_weights + offset; + } + int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); @@ -2314,17 +2342,13 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo cudaCheck(cudaGetLastError()); if (multi_gpu_config->zero_active) { - // gather all parameter updates from each process, should use 2 cudastreams in future - ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, - num_parameters, ncclFloatX, - multi_gpu_config->nccl_comm, 0)); + // gather all parameter updates from each process if (model->use_master_weights == 1) { ncclCheck(ncclAllGather(master_weights, model->master_weights, num_parameters, ncclFloat, multi_gpu_config->nccl_comm, 0)); - // Fix and generalize the kernel - // copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); - + // Copy and cast gathered master weights to params + copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); } else { ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, From 632caf1ce645695754ce2032b9e8ef126c97574c Mon Sep 17 00:00:00 2001 From: chinthysl Date: Thu, 2 May 2024 07:38:49 +0000 Subject: [PATCH 006/172] Refactored zero_active var --- train_gpt2.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index df44d78f9..ba15a9b02 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -364,7 +364,6 @@ typedef struct { // 2-Optimizer + Gradient State Sharding (SDP) // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) int zero_stage; - bool zero_active; size_t shard_num_parameters; size_t shard_offset; #ifdef MULTI_GPU @@ -462,7 +461,6 @@ void printf0(const char *format, ...) 
{ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t total_parameters) { multi_gpu_config->zero_stage = 0; - multi_gpu_config->zero_active = false; multi_gpu_config->shard_num_parameters = total_parameters; multi_gpu_config->shard_offset = 0; @@ -474,17 +472,18 @@ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t t else if (zero_stage == 1) { if (total_parameters % multi_gpu_config->num_processes != 0) { printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + multi_gpu_config->zero_stage = 0; } else { printf0("| Zero Stage1 is enabled |\n"); multi_gpu_config->zero_stage = 1; - multi_gpu_config->zero_active = true; multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); } } else{ printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + multi_gpu_config->zero_stage = 0; } #endif } @@ -2341,7 +2340,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); - if (multi_gpu_config->zero_active) { + if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each process if (model->use_master_weights == 1) { ncclCheck(ncclAllGather(master_weights, model->master_weights, From c81adeb6b254e8fdc4b9fad722c69442806bdb6e Mon Sep 17 00:00:00 2001 From: lancer Date: Thu, 2 May 2024 17:36:12 -0700 Subject: [PATCH 007/172] move gelu_backward to backward block --- dev/cuda/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index 4ea763762..834a98b0f 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -26,7 +26,6 @@ attention_forward: attention_forward.cu classifier_fused: classifier_fused.cu crossentropy_forward: crossentropy_forward.cu encoder_forward: encoder_forward.cu -gelu_backward: gelu_backward.cu gelu_forward: gelu_forward.cu layernorm_forward: layernorm_forward.cu residual_forward: residual_forward.cu @@ -40,6 +39,7 @@ matmul_forward: matmul_forward.cu attention_backward: attention_backward.cu crossentropy_softmax_backward: crossentropy_softmax_backward.cu encoder_backward: encoder_backward.cu +gelu_backward: gelu_backward.cu layernorm_backward: layernorm_backward.cu matmul_backward_bias: matmul_backward_bias.cu matmul_backward: matmul_backward.cu From 59b66f6c07d36c1e7307437f8e1e36470d7ecf12 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Sat, 4 May 2024 16:22:44 +0800 Subject: [PATCH 008/172] refactor and fix CI issue --- Makefile | 2 +- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 68 +++++++++++++++++++++++++++++++------------------ 4 files changed, 46 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 06923801d..ae43f4c75 100644 --- a/Makefile +++ b/Makefile @@ -207,7 +207,7 @@ test_gpt2fp32cu: test_gpt2_fp32.cu $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(NVCC_LDFLAGS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index f412eed89..445d9dbe6 
100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -76,7 +76,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, model.num_parameters, 0); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free gpt2_free(&model); diff --git a/test_gpt2.cu b/test_gpt2.cu index 67afa5065..c6ca4a8a8 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -281,7 +281,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, model.num_parameters, 0); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index 4f0e32dd3..19dd000e9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1379,6 +1379,11 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* template __device__ Td cast_value(Ts val); +template<> +__device__ float cast_value(float val) { + return val; +} + template<> __device__ float cast_value(half val) { return __half2float(val); @@ -2373,6 +2378,11 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); #ifdef MULTI_GPU + // all gather is only required when num_processes > 1 + if (multi_gpu_config->num_processes == 1) { + return; + } + // Average all gradients. ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, @@ -2383,22 +2393,18 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, size_t shard_num_parameters, size_t shard_offset) { NVTX_RANGE_FN(); // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - size_t num_parameters = multi_gpu_config->shard_num_parameters; - size_t offset = multi_gpu_config->shard_offset; - floatX* params_memory = (floatX*)model->params_memory + offset; - floatX* grads_memory = (floatX*)model->grads_memory + offset; - // lazily allocate the memory for m_memory and v_memory + // lazily allocate the memory for m_memory and v_memory according to shard configs if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, 
shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, shard_num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); @@ -2408,35 +2414,48 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + floatX* params_memory = (floatX*)model->params_memory + shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + shard_offset; float* master_weights = NULL; if (model->use_master_weights == 1) { - master_weights = model->master_weights + offset; + master_weights = model->master_weights + shard_offset; } int block_size = 512; - int num_blocks = CEIL_DIV(num_parameters, block_size); + int num_blocks = CEIL_DIV(shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, + adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); +} + +void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) +{ +#ifdef MULTI_GPU + // all gather is only required when num_processes > 1 + if (multi_gpu_config->num_processes == 1) { + return; + } if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each process if (model->use_master_weights == 1) { - ncclCheck(ncclAllGather(master_weights, model->master_weights, - num_parameters, ncclFloat, + ncclCheck(ncclAllGather(model->master_weights + multi_gpu_config->shard_offset, model->master_weights, + multi_gpu_config->shard_num_parameters, ncclFloat, multi_gpu_config->nccl_comm, 0)); - // Copy and cast gathered master weights to params + // Copy and cast master weights to params copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); } else { - ncclCheck(ncclAllGather(params_memory, (floatX*)model->params_memory, - num_parameters, ncclFloatX, + ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, + multi_gpu_config->shard_num_parameters, ncclFloatX, multi_gpu_config->nccl_comm, 0)); } - } + } + cudaCheck(cudaGetLastError()); +#endif } void gpt2_free(GPT2 *model) { @@ -2846,10 +2865,9 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - if (multi_gpu_config.num_processes > 1) { - gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - } - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, multi_gpu_config.shard_num_parameters, multi_gpu_config.shard_offset); + gpt2_multi_gpu_gather(&model, &multi_gpu_config); cudaEventRecord(end); 
float time_elapsed_ms; From bfb9c51446a5b0219c8fa8a2d4ffc06414293f3a Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:40:15 +0100 Subject: [PATCH 009/172] refactoring & remove unused functions to reduce LOC (+wip profile.py improvements) --- profile_gpt2.cu | 51 +--- profile_gpt2cu.py | 19 +- test_gpt2.cu | 49 +-- train_gpt2.cu | 761 +++++++++++++++++++--------------------------- 4 files changed, 328 insertions(+), 552 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index eab7fc58e..fd1d78d94 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -28,44 +28,7 @@ the profile.ncu-rep from a cloud box to local to pretty view. #include "train_gpt2.cu" int main() { - - // set up the device - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); - - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; - - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // setup cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = deviceProp.major >= 8 ? 1 : 0; - printf("enable_tf32: %d\n", enable_tf32); - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - // setup the (global) cuBLASLt workspace - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + common_start(); // build the GPT-2 model from a checkpoint GPT2 model; @@ -91,16 +54,8 @@ int main() { gpt2_backward(&model); gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings - // free - gpt2_free(&model); - - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); + // free + common_free(model); return 0; } diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index b3eec863a..8e15b7dc2 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -31,7 +31,7 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "smsp__inst_executed.sum", # instructions + "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # todo - tensor core % ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -55,11 +55,11 @@ for rid, row in enumerate(reader): if rid == 0: # headings - print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print(f"id pass {'name':<70} {'time':>8} {'RAM BW':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<70} {'ms':>8} {'GB/s':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" print(units) print("." * len(units)) continue @@ -74,7 +74,7 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) / 1e6 + inst = float(row[16]) kid = rid - 2 @@ -118,18 +118,21 @@ total['l2_write'] += l2_write total['inst'] += inst - print(f"{kid:02} {pass_name:4} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + dram_bw = (read + write) / (time / 1000.0); + + print(f"{kid:02} {pass_name:4} {fn_name:<70} {time:8.2f} {dram_bw:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") total_time = total['time'] +total_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0); print("." 
* len(units)) -print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<70} {total['time']:8.2f} {total_dram_bw:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") -print(f" {'name':<40} {'time':>6} {'frac':>6}") +print(f" {'name':<70} {'time':>6} {'frac':>6}") ordered = sorted(summaries.items(), key=lambda x: x[1], reverse=True) for entry, value in ordered: - print(f" {entry:<40} {value:6.2f} {100*value / total_time:6.2f}%") + print(f" {entry:<70} {value:6.2f} {100*value / total_time:6.2f}%") ts = total_time / 1000 diff --git a/test_gpt2.cu b/test_gpt2.cu index 9c98f5684..3fc6b6f0e 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,44 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { - - // set up the device - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); - - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; - - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // setup cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = cuda_arch_major >= 8 ? 1 : 0; - enable_tf32 = 0; // NOTE: disable TF32 for testing!!! - printf("enable_tf32: %d\n", enable_tf32); - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + common_start(false); // build the GPT-2 model from a checkpoint GPT2 model; @@ -327,6 +290,7 @@ int main(int argc, char *argv[]) { printf("overall okay: %d\n", allok); // free everything + common_free(model); free(x); free(y); free(logits_cpu_raw); @@ -336,14 +300,5 @@ int main(int argc, char *argv[]) { free(expected_grads_memory); free(grads_memory_cpu); free(grads_memory_cpu_float); - gpt2_free(&model); - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); - return 0; } diff --git a/train_gpt2.cu b/train_gpt2.cu index 971e9be27..2a86fdec5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -11,7 +11,7 @@ sure that those parts work out ok and that we do a += as necessary. 
E.g., the layernorms are connected to the residuals so we += in layernorm backward. In this file we are using Mixed Precision training, so different activations, -paramaters, grads and buffers may be kept at different precisions, to take +parameters, grads and buffers may be kept at different precisions, to take advantage of the fast low-precision hardware in the latest GPUs (bf16/fp16), and fp8 (coming soon^TM). @@ -33,26 +33,15 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), -a 1 is "overfit single batch", -x 10 is 10 iterations, and -f 0 disables tf32 */ -#include - #include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include // GPU / CUDA related -#include -#include #include +#include #include -#include #include - +#include // Multi-GPU related #ifdef MULTI_GPU #include @@ -73,20 +62,12 @@ enum PrecisionMode { PRECISION_BF16 }; -// Default Properties -typedef float floatN; -#define CUBLAS_LOWP_COMPUTE cublas_compute_type -#ifdef MULTI_GPU -const ncclDataType_t ncclFloatN = ncclFloat; -#endif - // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; #define CUBLAS_LOWP CUDA_R_32F #define PRECISION_MODE PRECISION_FP32 const char* load_filename = "gpt2_124M.bin"; -const char* precision_mode_str = "fp32"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclFloat; #endif @@ -97,7 +78,6 @@ typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define PRECISION_MODE PRECISION_FP16 const char* load_filename = "gpt2_124M.bin"; -const char* precision_mode_str = "fp16"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclHalf; #endif @@ -107,7 +87,6 @@ typedef __nv_bfloat16 floatX; #define CUBLAS_LOWP CUDA_R_16BF #define PRECISION_MODE PRECISION_BF16 const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights specific filename -const char* precision_mode_str = "bf16"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclBfloat16; #endif @@ -121,11 +100,9 @@ namespace fe = cudnn_frontend; #else #define CUDNN_16BIT fe::DataType_t::HALF #endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) static void* cudnn_workspace = NULL; -#define checkCudnnErr(err) assert((int)err == 0); #endif // ENABLE_CUDNN // ---------------------------------------------------------------------------- @@ -144,17 +121,14 @@ class NvtxRange { #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) // cuBLAS workspace. Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK -static size_t cublaslt_workspace_size = 32 * 1024 * 1024; -static void* cublaslt_workspace = NULL; -static cublasComputeType_t cublas_compute_type; -cublasHandle_t cublas_handle; +const size_t cublaslt_workspace_size = 32 * 1024 * 1024; +void* cublaslt_workspace = NULL; +cublasComputeType_t cublas_compute = CUBLAS_COMPUTE_32F; cublasLtHandle_t cublaslt_handle; -int cuda_arch_major = 0; -int cuda_arch_minor = 0; -int cuda_num_SMs = 0; // for persistent threads where we want 1 threadblock per SM -int cuda_threads_per_SM = 0; +cublasHandle_t cublas_handle; +cudaDeviceProp deviceProp; -// CUDA streams & events (note: non-timing events, use separate event for timing/profiling!) +// CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
constexpr int num_parallel_streams = 2; // + 1 primary "main_stream" (+ default stream) cudaStream_t parallel_streams[num_parallel_streams]; cudaEvent_t parallel_events[num_parallel_streams]; @@ -168,8 +142,7 @@ cudaEvent_t loss_event; // to make sure fused_classifier has written the losses // CUDA error checking void cudaCheck(cudaError_t error, const char *file, int line) { if (error != cudaSuccess) { - printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, - cudaGetErrorString(error)); + printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error)); exit(EXIT_FAILURE); } }; @@ -206,35 +179,6 @@ void mpi_check(int status, const char *file, int line) { #define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) #endif -// GPU helper functions for atomicAdd on smaller than 32-bit types -#ifdef ENABLE_BF16 -__device__ void atomicAddX(__nv_bfloat16* addr, __nv_bfloat16 val) { - uintptr_t ptr_val = reinterpret_cast(addr); - __nv_bfloat162* ptr_bf16 = reinterpret_cast<__nv_bfloat162*>(ptr_val & ~uintptr_t(0x3)); - - // Prepare the value to add, setting the other half to zero - __nv_bfloat162 add_val = (ptr_val & 0x3) ? __halves2bfloat162(__ushort_as_bfloat16(0), val) - : __halves2bfloat162(val, __ushort_as_bfloat16(0)); - atomicAdd(ptr_bf16, add_val); -} -#endif - -#ifdef ENABLE_FP16 -__device__ void atomicAddX(half* addr, half val) { - uintptr_t ptr_val = reinterpret_cast(addr); - half2* ptr_fp16 = reinterpret_cast(ptr_val & ~uintptr_t(0x3)); - - // Prepare the value to add, setting the other half to zero - half2 add_val = (ptr_val & 0x3) ? __halves2half2(__ushort_as_half(0), val) - : __halves2half2(val, __ushort_as_half(0)); - atomicAdd(ptr_fp16, add_val); -} -#endif - -__device__ void atomicAddX(float* addr, float val) { - atomicAdd(addr, val); -} - // warp-level reduction for summing values __device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -242,7 +186,6 @@ __device__ float warpReduceSum(float val) { } return val; } - // warp-level reduction for finding the maximum value __device__ float warpReduceMax(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -250,16 +193,6 @@ __device__ float warpReduceMax(float val) { } return val; } - -#if defined(ENABLE_BF16) || defined(ENABLE_FP16) -__device__ floatX warpReduceSum(floatX val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} -#endif - // requires all 32 threads in the warp to be active, but should work for any block size // uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes // the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end @@ -270,14 +203,13 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun // two reductions of up to 1024 threads: // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) __shared__ float shared_val[32]; - int lane_id = threadIdx.x % 32; - int warp_id = threadIdx.x / 32; - int num_warps = blockDim.x / 32; + const int lane_id = threadIdx.x % 32; + const int warp_id = threadIdx.x / 32; + const int num_warps = blockDim.x / 32; float warp_val = warp_reduction(val); if (lane_id == 0) { shared_val[warp_id] = warp_val; } __syncthreads(); - // same strategy, now reduce across warps warp_val = (lane_id < num_warps) ? 
shared_val[lane_id] : out_of_bounds; float block_val = warp_reduction(warp_val); @@ -287,7 +219,6 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun return block_val; } - // ---------------------------------------------------------------------------- // Packed128 data structure, which forces the compiler to use 128-bit loads/stores // in GPUs that support (the LDG.128 and STS.128 instructions) @@ -296,12 +227,11 @@ __device__ float blockReduce(float val, bool final_sync=false, float out_of_boun template struct alignas(16) Packed128 { - __device__ Packed128() = default; + Packed128() = default; __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); } - __device__ ElementType& operator[](int index) { return payload[index]; } @@ -314,39 +244,35 @@ struct alignas(16) Packed128 { memcpy(&bits, &payload, sizeof(bits)); return bits; } - static constexpr const size_t size = sizeof(int4) / sizeof(ElementType); ElementType payload[size]; }; -// short-form typedef -typedef Packed128 f128; -typedef Packed128 x128; - // load a Packed128 from an aligned memory address template __device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } - // load a Packed128 from an aligned memory address with streaming cache hint template __device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } - // store a Packed128 to an aligned memory address template __device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } - // store a Packed128 to an aligned memory address with streaming cache hint template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// short-form typedefs +typedef Packed128 f128; +typedef Packed128 x128; + // ---------------------------------------------------------------------------- // Random Number Generatiom @@ -387,25 +313,11 @@ __device__ __host__ constexpr unsigned int SquirrelNoise5(int positionX, unsigne mangledBits ^= (mangledBits >> 17); return mangledBits; } -__device__ __host__ constexpr unsigned int Get1dNoiseUint(int positionX, unsigned int seed) -{ - return SquirrelNoise5(positionX, seed); -} __device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed) { constexpr int PRIME_NUMBER = 198491317; // Large prime number with non-boring bits return SquirrelNoise5(indexX + (PRIME_NUMBER * indexY), seed); } -__device__ __host__ constexpr float Get1dNoiseZeroToOne(int index, unsigned int seed) -{ - constexpr double ONE_OVER_MAX_UINT = (1.0 / (double) 0xFFFFFFFF); - return (float)(ONE_OVER_MAX_UINT * (double) SquirrelNoise5(index, seed)); -} -__device__ __host__ constexpr float Get2dNoiseZeroToOne(int indexX, int indexY, unsigned int seed) -{ - constexpr double ONE_OVER_MAX_UINT = (1.0 / (double) 0xFFFFFFFF); - return (float)(ONE_OVER_MAX_UINT * (double) Get2dNoiseUint(indexX, indexY, seed)); -} // stochastic rounding built on top of Squirel Noise above (with seed updated per step via xorshift) __device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) { @@ -564,20 +476,16 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) @@ -613,6 +521,7 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); assert(graph->check_support(cudnn_handle).is_good()); assert(graph->build_plans(cudnn_handle).is_good()); + assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); user_maintained_cache_fwd.insert({key, tuple}); @@ -631,40 +540,32 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, NH, T, HS}) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") .set_dim({B, NH, T, HS}) .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") .set_dim({B, NH, T, HS}) .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") .set_dim({B, NH, T, 1}) .set_stride({NH * T, T, 1, 1}) .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") 
.set_causal_mask(true) .set_attn_scale(attn_scale); @@ -688,6 +589,16 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { assert(graph->check_support(cudnn_handle).is_good()); assert(graph->build_plans(cudnn_handle).is_good()); + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); user_maintained_cache_bwd.insert({key, tuple}); return tuple; @@ -721,16 +632,6 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) variant_pack[softmax_stats] = stats; } - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph assert(graph->execute(cudnn_handle, variant_pack, cudnn_workspace).is_good()); cudaCheck(cudaGetLastError()); @@ -765,16 +666,6 @@ void attention_backward_cudnn(floatX* dqkvr, {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, {attn_scale, &attn_scale_cpu}}; - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph assert(graph->execute(cudnn_handle, variant_pack, cudnn_workspace).is_good()); cudaCheck(cudaGetLastError()); @@ -789,51 +680,70 @@ __global__ void encoder_forward_kernel3(floatX* out, int B, int T, int C) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; int N = B * T * C; - if (idx < N) { - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; - - int ix = inp[b * T + t]; - - floatX* out_btc = out + b * T * C + t * C + c; - const floatX* wte_ix = wte + ix * C + c; - const floatX* wpe_tc = wpe + t * C + c; - - x128 packed_out; - x128 wte = load128cs(wte_ix); - x128 wpe = load128cs(wpe_tc); - #pragma unroll - for (int k = 0; k < wte.size; k++) { - packed_out[k] = (floatX)((float)wte[k] + (float)wpe[k]); - } - store128(out_btc, packed_out); + if (idx >= N) { return; } + + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; + + int ix = inp[b * T + t]; + + floatX* out_btc = out + b * T * C + t * C + c; + const floatX* wte_ix = wte + ix * C + c; + const floatX* wpe_tc = wpe + t * C + c; + + x128 packed_out; + x128 wte128 = load128cs(wte_ix); + x128 wpe128 = load128cs(wpe_tc); + for (int k = 0; k < x128::size; k++) { + packed_out[k] = (floatX)((float)wte128[k] + (float)wpe128[k]); } + store128(out_btc, packed_out); +} + +__device__ void atomicStochasticAdd(__nv_bfloat16* address, float val0, float val1, uint seed) { + float2 
val = make_float2(val0, val1); + uint* address_as_uint = (uint*)address; + uint old = *address_as_uint, assumed; + uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); + do { + assumed = old; + float2 old_fp32 = __bfloat1622float2(*(__nv_bfloat162*)&old); + float2 new_fp32 = make_float2(old_fp32.x + val.x, old_fp32.y + val.y); + __nv_bfloat162 new_bf16; + stochastic_rounding(new_fp32.x, &new_bf16.x, random); + stochastic_rounding(new_fp32.y, &new_bf16.y, random >> 16); + old = atomicCAS(address_as_uint, assumed, *(uint*)&new_bf16); + } while (assumed != old); +} +__device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { + atomicAdd(address, val0); + atomicAdd(address + 1, val1); } -// really bad naive kernel with atomicAdd __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C) { + int B, int T, int C, uint seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int N = B * T * C; + idx *= 2; // 2 elements per thread + if (idx >= N) { return; } - if (idx < N) { - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; - int ix = inp[b * T + t]; + int ix = inp[b * T + t]; - const floatX* dout_btc = dout + b * T * C + t * C + c; - floatX* dwte_ix = dwte + ix * C + c; - floatX* dwpe_tc = dwpe + t * C + c; + const floatX* dout_btc = dout + b * T * C + t * C + c; + floatX* dwte_ix = dwte + ix * C + c; + floatX* dwpe_tc = dwpe + t * C + c; - atomicAddX(dwte_ix, (floatX)*dout_btc); - atomicAddX(dwpe_tc, (floatX)*dout_btc); - } + float2 dout_data = make_float2(dout_btc[0], dout_btc[1]); + atomicStochasticAdd(dwte_ix, dout_data.x, dout_data.y, seed); + atomicStochasticAdd(dwpe_tc, dout_data.x, dout_data.y, seed ^ 0xFFFFFFFF); } __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, @@ -891,38 +801,38 @@ __global__ void permute_kernel(floatX* q, floatX* k, floatX* v, // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, N, d) // but instead, we have a single tensor QKV (inp) of shape (B, N, 3, NH, d) int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * NH * N * d) { return; } + // Q[b][nh_][n][d_] = inp[b][n][0][nh_][d_] - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - q[idx] = __ldcs(&inp[inp_idx]); - k[idx] = __ldcs(&inp[inp_idx + NH * d]); - v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); - } + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + q[idx] = __ldcs(&inp[inp_idx]); + k[idx] = __ldcs(&inp[inp_idx + NH * d]); + v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); } __global__ void permute_kernel_backward(floatX* dinp, const floatX* dq, const floatX* dk, const floatX* dv, int B, int N, int NH, int d) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; + if (idx >= B * NH * N * d) { return; } - int inp_idx = 
(b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - dinp[inp_idx] = dq[idx]; - dinp[inp_idx + NH * d] = dk[idx]; - dinp[inp_idx + 2 * (NH * d)] = dv[idx]; - } + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + dinp[inp_idx] = dq[idx]; + dinp[inp_idx + NH * d] = dk[idx]; + dinp[inp_idx + 2 * (NH * d)] = dv[idx]; } __global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int d) { @@ -930,30 +840,30 @@ __global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int idx = (blockIdx.x * blockDim.x + threadIdx.x); // out[b][n][nh_][d_] <- inp[b][nh_][n][d_] - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - out[other_idx] = __ldcs(&inp[idx]); - } + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + out[other_idx] = __ldcs(&inp[idx]); } __global__ void unpermute_kernel_backward(floatX* dinp, const floatX *dout, int B, int N, int NH, int d) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - dinp[idx] = (floatX)dout[other_idx]; - } + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + dinp[idx] = (floatX)dout[other_idx]; } __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, const floatX* inp, int N, int T) { @@ -983,13 +893,13 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons const floatX* x = inp + idx * T; // not INF, so we don't get NaNs accidentally when subtracting two values. 
+ const float FLT_MAX = 340282346638528859811704183484516925440.0f; // to avoid including float.h float maxval = -FLT_MAX; float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); for (int i = lane_id; i < pos_by_4; i += warp_size) { float regarray[4]; - #pragma unroll for (int k = 0; k < 4; ++k) { regarray[k] = (float)x_aligned[4*i + k]; } @@ -1026,67 +936,61 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons __global__ void residual_forward_kernel(floatX* out, floatX* inp1, floatX* inp2, int N) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx < N) { - x128 packed_out; - x128 packed_inp1 = load128cs(inp1 + idx); - x128 packed_inp2 = load128cs(inp2 + idx); - #pragma unroll - for (int k = 0; k < packed_inp1.size; k++) { - packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); - } - store128(out + idx, packed_out); + if (idx >= N) { return; } + + x128 packed_out; + x128 packed_inp1 = load128cs(inp1 + idx); + x128 packed_inp2 = load128cs(inp2 + idx); + for (int k = 0; k < packed_inp1.size; k++) { + packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); } + store128(out + idx, packed_out); } #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { - int i = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (i < N) { - x128 packed_out; - x128 packed_inp = load128cs(inp + i); // load and do not keep in cache - for(int k = 0; k < packed_inp.size; ++k) { - float xi = (float)packed_inp[k]; - float cube = 0.044715f * xi * xi * xi; - packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); - } - // store instead of storecs (without cache streaming) in case it is useful for the - // data to be in the cache for the next operation after this GeLU - store128(out + i, packed_out); + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= N) { return; } + + x128 packed_out; + x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache + for(int k = 0; k < packed_inp.size; ++k) { + float xi = (float)packed_inp[k]; + float cube = 0.044715f * xi * xi * xi; + packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); } + // store instead of storecs (without cache streaming) in case it is useful for the + // data to be in the cache for the next operation after this GeLU + store128(out + idx, packed_out); } __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { - int i = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (i < N) { - x128 packed_dinp; - x128 packed_inp = load128cs(inp + i); - x128 packed_dout = load128cs(dout + i); - for (int k = 0; k < packed_inp.size; ++k) { - float x = (float)packed_inp[k]; - float cube = 0.044715f * x * x * x; - float tanh_arg = GELU_SCALING_FACTOR * (x + cube); - float tanh_out = tanhf(tanh_arg); - float coshf_out = coshf(tanh_arg); - float sech_out = 1.0f / (coshf_out * coshf_out); - float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); - packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); - } + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= N) { return; } - store128(dinp + i, packed_dinp); + x128 packed_dinp; + x128 packed_inp = load128cs(inp + idx); + x128 packed_dout = load128cs(dout + idx); + for 
(int k = 0; k < packed_inp.size; ++k) { + float x = (float)packed_inp[k]; + float cube = 0.044715f * x * x * x; + float tanh_arg = GELU_SCALING_FACTOR * (x + cube); + float tanh_out = tanhf(tanh_arg); + float coshf_out = coshf(tanh_arg); + float sech_out = 1.0f / (coshf_out * coshf_out); + float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); + packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); } + store128(dinp + idx, packed_dinp); } __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, int B, int T, int OC) { // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX + // (this also results in higher accuracy than doing doing accumulation directly in floatX) - // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) - // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end - // blockDim.x is 32 --> single warp being responsible for those 256 OCs - // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs - // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * cuda_threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + // see comments in matmul_backward() for an explanation of block/grid dimensions etc. const int block_size = 512; const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 @@ -1101,26 +1005,23 @@ __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int i = thread_id; i < OC_per_warp; i += block_size) { - shared[i] = 0.0f; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; } __syncthreads(); - for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + i*OC); + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { - //printf("%d: %f + %f\n", oc, accumulators[k], (float)packed_dout[k]); accumulators[k] += (float)packed_dout[k]; } - //__syncthreads(); // keep block synchronised to maximise memory locality (?) 
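The comment at the top of matmul_backward_bias_kernel6 notes that accumulating into an fp32 scratch buffer is both faster for atomics and more accurate than accumulating directly in floatX. Below is a host-only toy, not taken from the patch, that emulates bf16 round-to-nearest in software to show the accuracy half of that claim; the 1e-3 "gradient" value and the iteration count are made up:

#include <cstdio>
#include <cstring>

// emulate rounding a float to bfloat16 precision (keep 8 mantissa bits, round to nearest even)
static float round_to_bf16(float x) {
    unsigned int u;
    memcpy(&u, &x, sizeof(u));
    u = (u + 0x7FFFu + ((u >> 16) & 1u)) & 0xFFFF0000u;
    float y;
    memcpy(&y, &u, sizeof(y));
    return y;
}

int main() {
    float acc_fp32 = 0.0f, acc_bf16 = 0.0f;
    for (int i = 0; i < 100000; i++) {
        const float g = 1e-3f;                    // a small per-(b,t) contribution to dbias
        acc_fp32 += g;                            // fp32 scratch: accumulates fine
        acc_bf16 = round_to_bf16(acc_bf16 + g);   // bf16 accumulator: stalls once half an ulp exceeds g
    }
    printf("fp32 accumulator: %f\n", acc_fp32);   // ~100.0
    printf("bf16 accumulator: %f\n", acc_bf16);   // stalls at 0.5: further additions round away
    return 0;
}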
} for (int k = 0; k < x128::size; k++) { atomicAdd(shared + local_oc + k, accumulators[k]); } __syncthreads(); if (threadIdx.y == 0) { - for (int i = threadIdx.x; i < OC_per_warp; i += block_size_x) { - //printf("%d => %f\n", i, shared[i]); - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + for (int idx = threadIdx.x; idx < OC_per_warp; idx += block_size_x) { + atomicAdd(dbias + idx + blockIdx.x*OC_per_warp, shared[idx]); } } } @@ -1140,7 +1041,6 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX float* dweight_shared = shared + C; // init shared memory to zero - #pragma unroll 4 for(int i = threadIdx.x; i < C; i+= blockDim.x){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; @@ -1167,12 +1067,11 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX dnorm_mean += dnorm_i; dnorm_norm_mean += dnorm_i * norm_bti; } - dnorm_mean = warpReduceSum(dnorm_mean); - dnorm_norm_mean = warpReduceSum(dnorm_norm_mean); - dnorm_mean = dnorm_mean / C; - dnorm_norm_mean = dnorm_norm_mean / C; + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; // now iterate again and accumulate all the gradients + // todo - use x128 for this loop to improve performance for (int i = warpThreadIdx; i < C; i += warpSize) { float dout_i = (float)__ldcs(&dout_bt[i]); float norm_bti = ((float)__ldcs(&inp_bt[i]) - mean_bt) * rstd_bt; @@ -1193,7 +1092,7 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX // Accumulate into a FP32 scratchpad // BF16 atomics are potentially much slower... and this is more precise! - // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though + // todo - could avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; @@ -1221,9 +1120,9 @@ __global__ void softmax_autoregressive_backward_kernel(floatX* dpreatt, const fl constexpr const int BlockSize = 256; constexpr int T_per_block = 4; - int idx = blockIdx.y; // go through blocks in reverse order, so the slowest block starts first int t0 = T - 1 - T_per_block*blockIdx.x; + int idx = blockIdx.y; att += idx * T * T; datt += idx * T * T; @@ -1254,41 +1153,40 @@ __global__ void softmax_autoregressive_backward_kernel(floatX* dpreatt, const fl // Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation). 
// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda -__device__ inline float lerp(float start, float end, float weight) { +__device__ float lerp(float start, float end, float weight) { return fma(weight, end, fma(-weight, start, start)); } -// Termplate type T instead of floatx template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, unsigned int seed) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_parameters) return; // guard + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } // guard // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[i]; - float m = m_memory[i]; - float v = v_memory[i]; + float grad = (float)grads_memory[idx]; + float m = m_memory[idx]; + float v = v_memory[idx]; // update the first moment (momentum) m = lerp(grad, m, beta1); - m_memory[i] = m; + m_memory[idx] = m; // update the second moment (RMSprop) v = lerp(grad * grad, v, beta2); - v_memory[i] = v; + v_memory[idx] = v; m /= beta1_correction; // m_hat v /= beta2_correction; // v_hat // fetch the old value of this parameter as a float, from either source - float old_param = (master_params_memory != NULL) ? master_params_memory[i] : (float)params_memory[i]; + float old_param = (master_params_memory != NULL) ? master_params_memory[idx] : (float)params_memory[idx]; // update this parameter float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param)); // update our low precision version of the parameters using stochastic rounding // this will be used in the next forward pass // TODO: simply doing `params_memory[i] = (floatX)param;` breaks everything (why?) unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - stochastic_rounding(param, ¶ms_memory[i], random); + stochastic_rounding(param, ¶ms_memory[idx], random); // write the full, float version of the param into our master copy, if we maintain one // this will be used in the next update - if (master_params_memory != NULL) { master_params_memory[i] = param; } + if (master_params_memory != NULL) { master_params_memory[idx] = param; } } struct SoftmaxParams { @@ -1378,14 +1276,14 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* __global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { // a small kernel to copy and cast, i.e. 
`dst <- (float) src` - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] = (float)src[i]; } + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = (float)src[idx]; } } __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { dst[i] += (floatX)src[i]; } // have to += because dbias is a paramater + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] += (floatX)src[idx]; } // have to += because dbias is a paramater } // ---------------------------------------------------------------------------- @@ -1404,12 +1302,12 @@ void encoder_forward(floatX* out, void encoder_backward(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C) { + int B, int T, int C, uint seed) { NVTX_RANGE_FN(); const int N = B * T * C; const int block_size = 256; - const int grid_size = CEIL_DIV(N, block_size); - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C); + const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements + encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); cudaCheck(cudaGetLastError()); } @@ -1424,9 +1322,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, cudaCheck(cudaGetLastError()); } -// uses cuBLASLt to fuse the bias and gelu. does not work with OC = 50257 (last layer) // https://docs.nvidia.com/cuda/cublas/#cublasltmatmul -// https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuBLASLt/LtSgemm/sample_cublasLt_LtSgemm.cu void matmul_forward_cublaslt(floatX* out, floatX* inp, floatX* weight, floatX* bias, int B, int T, int C, int OC) { @@ -1439,13 +1335,8 @@ void matmul_forward_cublaslt(floatX* out, exit(EXIT_FAILURE); } - // FP16 alpha/beta need to be used if and only if CUBLAS_COMPUTE_16F + // these need to be in FP16 if and only if alpha/beta are CUBLAS_COMPUTE_16F const float alpha = 1.0f, beta = 0.0f; - const half alpha_fp16 = (half)alpha, beta_fp16 = (half)beta; - const void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&alpha_fp16 : (const void*)α - const void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&beta_fp16 : (const void*)β int returnedResults = 0; cublasLtMatmulDesc_t operationDesc; @@ -1459,16 +1350,12 @@ void matmul_forward_cublaslt(floatX* out, // create the operation descriptor cublasOperation_t opNoTranspose = CUBLAS_OP_N; cublasOperation_t opTranspose = CUBLAS_OP_T; - cublasLtEpilogue_t epilogueBias = CUBLASLT_EPILOGUE_BIAS; + cublasLtEpilogue_t epilogueBias = has_bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; - cudaDataType_t scale_type = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
CUDA_R_16F : CUDA_R_32F; - cublasCheck(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_LOWP_COMPUTE, scale_type)); + cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute, CUDA_R_32F)); // FP16 if CUBLAS_COMPUTE_16F cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opNoTranspose, sizeof(opNoTranspose))); - if(has_bias) { - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, - sizeof(epilogueBias))); - } + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, sizeof(epilogueBias))); cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); // define matrix layouts @@ -1480,8 +1367,7 @@ void matmul_forward_cublaslt(floatX* out, // create a preference handle with specified max workspace cublasCheck(cublasLtMatmulPreferenceCreate(&preference)); cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); // find a suitable algorithm cublasCheck(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc, @@ -1494,7 +1380,7 @@ void matmul_forward_cublaslt(floatX* out, // call the matmul cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, - alpha_ptr, weight, weightLayout, inp, inputLayout, beta_ptr, + &alpha, weight, weightLayout, inp, inputLayout, &beta, out, outputLayout, out, outputLayout, &heuristic.algo, cublaslt_workspace, cublaslt_workspace_size, main_stream)); @@ -1514,7 +1400,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // Note: `inp` is not needed for backward pass, so we re-use it as a scratch buffer. // Its contents will be overwritten by this function. const int block_size = 256; - const int softmax_block_size = 256; + const float alpha = 1.0f, beta = 0.0f; // inp is (B, T, 3C) QKV // preatt, att are (B, NH, T, T) @@ -1529,50 +1415,32 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, int total_threads = B * NH * T * HS; int num_blocks = CEIL_DIV(total_threads, block_size); permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); - cudaCheck(cudaGetLastError()); - // IMPORTANT: alpha/beta are FP32 for CUBLAS_COMPUTE_32F even if FP16 inputs/outputs - // But need FP16 scale for CUBLAS_COMPUTE_16F (no errors otherwise, just garbage results *sigh*) - const float alpha = 1.0f; - const float beta = 0.0f; - const floatX alpha_lowp = (floatX)alpha; - const floatX beta_lowp = (floatX)beta; - void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? (void*)&alpha_lowp : (void*)α - void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
(void*)&beta_lowp : (void*)β floatX* preatt = inp; cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, - T, T, HS, - alpha_ptr, + T, T, HS, &alpha, k, CUBLAS_LOWP, HS, T * HS, q, CUBLAS_LOWP, HS, T * HS, - beta_ptr, - preatt, CUBLAS_LOWP, T, T * T, - B * NH, - CUBLAS_LOWP_COMPUTE, - CUBLAS_GEMM_DEFAULT)); + &beta, preatt, CUBLAS_LOWP, T, T * T, + B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); - int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); - softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); - cudaCheck(cudaGetLastError()); + int grid_size = CEIL_DIV(B * NH * T * 32, block_size); + softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); // new approach: first cuBLAS another batched matmul floatX* vaccum = inp; // y = att @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs) cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, - HS, T, T, - alpha_ptr, + HS, T, T, &alpha, v, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, - beta_ptr, - vaccum, CUBLAS_LOWP, HS, T * HS, - B * NH, - CUBLAS_LOWP_COMPUTE, - CUBLAS_GEMM_DEFAULT)); + &beta, vaccum, CUBLAS_LOWP, HS, T * HS, + B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // now unpermute // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side @@ -1610,8 +1478,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* dbias_buffer, int B, int T, int C, int OC) { NVTX_RANGE_FN(); - float one = 1.0f; - float zero = 0.0f; + float one = 1.0f, zero = 0.0f; // backward to bias, if given, does a += if (dbias != NULL) { @@ -1620,14 +1487,15 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // blockDim.x is 32 --> single warp being responsible for those 256 OCs // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * cuda_threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! const int warp_size = 32; const int block_size = 512; const int OC_per_warp = warp_size * x128::size; // 256 at BF16 const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 const int grid_size_x = OC / OC_per_warp; // e.g. 3 horizontal blocks for 768 OCs at BF16 - const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount + / (block_size * grid_size_x)); // full GPU! 
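To make the launch-dimension comments above concrete, here is a throwaway calculation (not part of the patch) with assumed values: OC = 768 as in the code's own "3 horizontal blocks for 768 OCs" example, x128::size = 8 for bf16, and an A100-like GPU with 108 SMs and 2048 resident threads per SM:

#include <algorithm>
#include <cstdio>

int main() {
    const int OC = 768;                              // assumed output channels
    const int sm_count = 108, threads_per_sm = 2048; // assumed A100-like occupancy figures
    const int warp_size = 32, block_size = 512;
    const int x128_size = 8;                         // 16-byte loads / 2-byte bf16 elements
    const int OC_per_warp = warp_size * x128_size;   // 256 output channels per warp
    const int grid_size_x = OC / OC_per_warp;        // 3 blocks cover all 768 OCs
    const int grid_size_y = std::max(1, sm_count * threads_per_sm / (block_size * grid_size_x)); // 144
    printf("grid = (%d, %d), block = (%d, %d)\n",
           grid_size_x, grid_size_y, warp_size, block_size / warp_size);
    return 0;
}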
assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance @@ -1636,17 +1504,17 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, dim3(block_size_x, block_size_y), OC_per_warp * sizeof(float), main_stream>>>(dbias_buffer, dout, B, T, OC); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); - cudaCheck(cudaGetLastError()); } // backward to input, uses = in the backward pass (set the gradient) cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, C, B*T, OC, &one, weight, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &zero, - dinp, CUBLAS_LOWP, C, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // backward to weight, uses += in the backward pass (accumulate the gradient) + dinp, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // backward to weight, uses += in the backward pass (accumulate the gradient) by setting alpha=one cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, C, OC, B*T, &one, inp, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &one, - dweight, CUBLAS_LOWP, C, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + dweight, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + cudaCheck(cudaGetLastError()); } void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -1654,11 +1522,10 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 1024; - const int grid_size = 1 * cuda_num_SMs; + const int grid_size = deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); - layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1672,14 +1539,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da NVTX_RANGE_FN(); const int block_size = 256; int HS = C / NH; // head size - - // FP16 alpha/beta need to be used if and only if CUBLAS_COMPUTE_16F const float alpha = 1.0f, beta = 0.0f; - const half alpha_fp16 = (half)alpha, beta_fp16 = (half)beta; - const void* alpha_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? - (const void*)&alpha_fp16 : (const void*)α - const void* beta_ptr = (CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F) ? 
- (const void*)&beta_fp16 : (const void*)β // unpack convenience pointers into q, k, v const floatX *q, *k, *v; @@ -1694,31 +1554,26 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward through the unpermute operation int num_blocks = CEIL_DIV(B * T * C, block_size); unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); - cudaCheck(cudaGetLastError()); // backward into datt - - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, alpha_ptr, - v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, beta_ptr, - datt, CUBLAS_LOWP, T, T * T, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); - + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, + v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, &beta, + datt, CUBLAS_LOWP, T, T * T, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into dv - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, alpha_ptr, - scratch, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, beta_ptr, - dv, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); - + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, + scratch, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, &beta, + dv, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into preatt int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); - cudaCheck(cudaGetLastError()); // backward into q - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, alpha_ptr, - k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, beta_ptr, - dq, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &alpha, + k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, + dq, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into k - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, alpha_ptr, - q, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, beta_ptr, - dk, CUBLAS_LOWP, HS, T * HS, B * NH, CUBLAS_LOWP_COMPUTE, CUBLAS_GEMM_DEFAULT)); + cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, + q, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, + dk, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into inp num_blocks = CEIL_DIV(B * NH * T * HS, block_size); permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); @@ -1814,9 +1669,9 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen cudaCheck(cudaMalloc((void**)¶ms_memory, num_parameters_bytes)); // assign all the tensors their place in the array floatX** ptrs[] = { - ¶ms->wte, ¶ms->wpe, (floatX**)¶ms->ln1w, (floatX**)¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, - ¶ms->attprojw, ¶ms->attprojb, (floatX**)¶ms->ln2w, (floatX**)¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, - ¶ms->fcprojw, ¶ms->fcprojb, (floatX**)¶ms->lnfw, (floatX**)¶ms->lnfb + ¶ms->wte, ¶ms->wpe, ¶ms->ln1w, ¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, + ¶ms->attprojw, ¶ms->attprojb, ¶ms->ln2w, ¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, + ¶ms->fcprojw, ¶ms->fcprojb, ¶ms->lnfw, ¶ms->lnfb }; char* 
params_memory_iterator = (char*)params_memory; for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { @@ -1826,7 +1681,7 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen return params_memory; } -#define NUM_ACTIVATION_TENSORS 21 +#define NUM_ACTIVATION_TENSORS 20 typedef struct { floatX* encoded; // (B, T, C) floatX* ln1; // (L, B, T, C) @@ -1846,7 +1701,6 @@ typedef struct { floatX* lnf; // (B, T, C) floatX* lnf_mean; // (B, T) floatX* lnf_rstd; // (B, T) - floatX* losses; // (B, T) // todo - no longer used as GPU writes directly to cpu_losses // adding these two compared to the CPU .c code, needed for attention kernel as buffers floatX* qkvr; // (L, B, T, 3*C) // in inference mode, this buffer will store the logits @@ -1885,9 +1739,8 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[15] = B * T * C; // lnf act_sizes[16] = B * T; // lnf_mean act_sizes[17] = B * T; // lnf_rstd - act_sizes[18] = B * T; // losses - act_sizes[19] = L * B * T * 3*C; // qkvr - act_sizes[20] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch + act_sizes[18] = L * B * T * 3*C; // qkvr + act_sizes[19] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch } // Backward pass is conceptually quite different from forward, because we can discard @@ -1938,7 +1791,7 @@ void* malloc_and_point_activations(ActivationTensors* acts, const size_t* act_si &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->atty, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->losses, &acts->qkvr, &acts->output + &acts->lnf_mean, &acts->lnf_rstd, &acts->qkvr, &acts->output }; return malloc_and_point(ptrs, act_sizes, NUM_ACTIVATION_TENSORS); } @@ -2358,7 +2211,7 @@ void gpt2_backward(GPT2 *model) { // layernorm backward does += to dresidual, so it correctly accumulates gradient for the Attention block above layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } - encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C); + encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); // accumulate the loss, this was calculated at the end of gpt2_forward() cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago @@ -2441,9 +2294,61 @@ void gpt2_free(GPT2 *model) { cudaFreeHost(model->cpu_losses); } +// ---------------------------------------------------------------------------- +// common init & free code for train/test/profile +void common_start(bool override_enable_tf32 = true) { + int deviceIdx = 0; + cudaCheck(cudaSetDevice(deviceIdx)); + cudaGetDeviceProperties(&deviceProp, deviceIdx); + printf("[System]\n"); + printf("Device %d: %s\n", deviceIdx, deviceProp.name); + + cudaCheck(cudaStreamCreate(&main_stream)); + cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); + for (int i = 0; i < num_parallel_streams; i++) { + cudaCheck(cudaStreamCreate(¶llel_streams[i])); + cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); + } + + // set up cuBLAS and cuBLASLt (and cuDNN if enabled) + cublasCheck(cublasCreate(&cublas_handle)); + cublasCheck(cublasSetStream(cublas_handle, main_stream)); + cublasCheck(cublasLtCreate(&cublaslt_handle)); + 
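For scale, a quick standalone evaluation (not part of the patch) of the shared output/scratch activation size above, act_sizes[19] = B * T * max(3C, max(NH*T, Vp)), using assumed GPT-2 124M shapes (B = 4, T = 1024, C = 768, NH = 12, Vp = 50304); the padded-vocab term dominates:

#include <algorithm>
#include <cstdio>

int main() {
    const size_t B = 4, T = 1024, C = 768, NH = 12, Vp = 50304;     // assumed shapes
    const size_t per_token = std::max(3 * C, std::max(NH * T, Vp)); // 2304 vs 12288 vs 50304
    const size_t elems = B * T * per_token;
    printf("output/scratch elements: %zu (%.1f MiB at 2 bytes/element)\n",
           elems, (double)(elems * 2) / (1024.0 * 1024.0));
    return 0;
}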
cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); + #ifdef ENABLE_CUDNN + checkCudnnErr(cudnnCreate(&cudnn_handle)); + #endif + // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') + bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; + cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); + cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; + + // setup the (global) cuBLASLt workspace + cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); +} + +void common_free(GPT2 &model) { + cudaCheck(cudaEventDestroy(main_event)); + cudaCheck(cudaEventDestroy(loss_event)); + for (int i = 0; i < num_parallel_streams; i++) { + cudaCheck(cudaStreamDestroy(parallel_streams[i])); + cudaCheck(cudaEventDestroy(parallel_events[i])); + } + cudaCheck(cudaStreamDestroy(main_stream)); + + gpt2_free(&model); + #ifdef ENABLE_CUDNN + if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } + checkCudnnErr(cudnnDestroy(cudnn_handle)); + #endif + cudaCheck(cudaFree(cublaslt_workspace)); + cublasCheck(cublasDestroy(cublas_handle)); + cublasCheck(cublasLtDestroy(cublaslt_handle)); +} + #ifndef TESTING // if we are TESTING (see test_gpt2.cu), we'll skip the int main below - // ---------------------------------------------------------------------------- // data loader lite: returns random batches of data from a file of integers @@ -2656,45 +2561,14 @@ int main(int argc, char *argv[]) { printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); - // set up the device - cudaCheck(cudaSetDevice(multi_gpu_config.local_device_idx)); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx); - cuda_num_SMs = deviceProp.multiProcessorCount; - cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor; - cuda_arch_major = deviceProp.major; - cuda_arch_minor = deviceProp.minor; + common_start(override_enable_tf32); // common init code for train/test/profile - cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } - - // set up cuBLAS and cuBLASLt - cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - // setup compute precision settings for cublas - // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') - int enable_tf32 = cuda_arch_major >= 8 ? 1 : 0; - if (override_enable_tf32 == 0) { enable_tf32 = 0; } // force to zero via arg - cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - cublasMath_t cublas_math_mode = enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; - cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - if(cublas_compute_type); // unused in BF16 mode, avoid warning - - // set up cuDNN - #ifdef ENABLE_CUDNN - checkCudnnErr(cudnnCreate(&cudnn_handle)); - #endif + const char* precision_str = (PRECISION_MODE == PRECISION_FP32) + ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? "TF32" : "FP32") + : (PRECISION_MODE == PRECISION_FP16 ? "FP16" : "BF16"); printf0("| device | %-50s |\n", deviceProp.name); - printf0("| TF32 | %-50s |\n", enable_tf32 ? "enabled" : "disabled"); - printf0("| precision | %-50s |\n", precision_mode_str); + printf0("| precision | %-50s |\n", precision_str); printf0("+-----------------------+----------------------------------------------------+\n"); // build the GPT-2 model from a checkpoint @@ -2712,17 +2586,15 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128]; - char val_tokens_filename[128]; + char train_tokens_filename[128], val_tokens_filename[128]; assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow // if we're only overfitting a single batch for debugging, let's overfit the first batch // from val instead of train split, because val is smaller and a bit faster const char* train_split = (overfit_single_batch == 1) ? "val" : "train"; sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); - DataLoader train_loader; + DataLoader train_loader, val_loader; dataloader_init(&train_loader, &multi_gpu_config, train_tokens_filename, B, T); - DataLoader val_loader; dataloader_init(&val_loader, &multi_gpu_config, val_tokens_filename, B, T); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? 
train_loader.num_batches : val_max_batches; @@ -2738,11 +2610,9 @@ int main(int argc, char *argv[]) { printf0("num_parameters: %zu ==> bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024))); - // set up the Logger + // set up the Logger & Tokenizer Logger logger; logger_init(&logger, output_log_file); - - // build the Tokenizer Tokenizer tokenizer; tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); @@ -2870,20 +2740,13 @@ int main(int argc, char *argv[]) { dataloader_free(&train_loader); dataloader_free(&val_loader); tokenizer_free(&tokenizer); - gpt2_free(&model); free(cpu_logits_raw); free(cpu_logits); free(gen_tokens); - #ifdef ENABLE_CUDNN - if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); - #endif - cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); logger_free(&logger); multi_gpu_config_free(&multi_gpu_config); + common_free(model); return 0; } #endif From abaaceb8011feb95ac402c0b5d152d3d65918f9a Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:43:13 +0100 Subject: [PATCH 010/172] Added makefile gencode changes --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 06923801d..a755c309f 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,9 @@ CUDA_OUTPUT_FILE = -o $@ # NVCC flags # -t=0 is short for --threads, 0 = number of CPUs on the machine -NVCC_FLAGS = -O3 -t=0 --use_fast_math +# include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU +# this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work +NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=compute_80 -arch=native NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = From 18d7ed92429756427b2af4483cc8f2203a4ebe35 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sat, 4 May 2024 23:46:39 +0100 Subject: [PATCH 011/172] revert profile.py changes for now --- profile_gpt2cu.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 8e15b7dc2..b3eec863a 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -31,7 +31,7 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # todo - tensor core % + "smsp__inst_executed.sum", # instructions ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -55,11 +55,11 @@ for rid, row in enumerate(reader): if rid == 0: # headings - print(f"id pass {'name':<70} {'time':>8} {'RAM BW':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<70} {'ms':>8} {'GB/s':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" 
print(units) print("." * len(units)) continue @@ -74,7 +74,7 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) + inst = float(row[16]) / 1e6 kid = rid - 2 @@ -118,21 +118,18 @@ total['l2_write'] += l2_write total['inst'] += inst - dram_bw = (read + write) / (time / 1000.0); - - print(f"{kid:02} {pass_name:4} {fn_name:<70} {time:8.2f} {dram_bw:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + print(f"{kid:02} {pass_name:4} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") total_time = total['time'] -total_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0); print("." * len(units)) -print(f" {'Total':<70} {total['time']:8.2f} {total_dram_bw:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") -print(f" {'name':<70} {'time':>6} {'frac':>6}") +print(f" {'name':<40} {'time':>6} {'frac':>6}") ordered = sorted(summaries.items(), key=lambda x: x[1], reverse=True) for entry, value in ordered: - print(f" {entry:<70} {value:6.2f} {100*value / total_time:6.2f}%") + print(f" {entry:<40} {value:6.2f} {100*value / total_time:6.2f}%") ts = total_time / 1000 From ec0ab2d2d0eb5b7be3a7fa4bb93953cf57a9e364 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 00:00:58 +0100 Subject: [PATCH 012/172] Remove arch=native as it only available on recent CUDA versions --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a755c309f..05a02c387 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CUDA_OUTPUT_FILE = -o $@ # -t=0 is short for --threads, 0 = number of CPUs on the machine # include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU # this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work -NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=compute_80 -arch=native +NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=sm_80 NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = From 83ec4b8e701be6488ae632184731bd2f5e8af8bc Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:09:52 +0100 Subject: [PATCH 013/172] Slightly reduce lines of code in cudnn_att --- cudnn_att.cu | 164 +++++++++++++++++++++------------------------------ 1 file changed, 68 insertions(+), 96 deletions(-) diff --git a/cudnn_att.cu b/cudnn_att.cu index 2735bbd14..fdff63483 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -5,18 +5,22 @@ #include #include #include +namespace fe = cudnn_frontend; // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) typedef float floatX; - +static_assert(false, "cuDNN is not supported in FP32 mode.") // use fp16 (note: this may require gradient scaler, currently not implemented!) 
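The fp16 comment above notes that a gradient scaler may be required but is not implemented. Purely as an illustration of that general technique (this is not code from this repository, and the function name is invented), the usual dynamic loss-scaling recipe is: run backward on loss * scale, then unscale the gradients and skip the optimizer step whenever an overflow is detected:

#include <cmath>
#include <cstddef>

// returns true if the optimizer step should proceed; assumes backward was run on (loss * scale)
bool unscale_grads_and_check(float* grads, size_t n, float* scale) {
    for (size_t i = 0; i < n; i++) {
        const float g = grads[i] / *scale;
        if (!std::isfinite(g)) {   // inf/nan: the current scale was too aggressive
            *scale *= 0.5f;        // back off and skip this step entirely
            return false;
        }
        grads[i] = g;
    }
    // after enough consecutive good steps a real implementation would grow the scale again,
    // typically by doubling it; omitted here to keep the sketch short
    return true;
}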
+ #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F - +#define CUDNN_16BIT fe::DataType_t::HALF #else // Default to bfloat16 + typedef __nv_bfloat16 floatX; +#define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif // CUDA error checking @@ -30,28 +34,17 @@ static void cudaCheck(cudaError_t error, const char *file, int line) { #define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) // Profiler utils -namespace { - class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - - ~NvtxRange() { nvtxRangePop(); } - }; -} +class NvtxRange { + public: + NvtxRange(const char* s) { nvtxRangePush(s); } + NvtxRange(const std::string& base_str, int number) { + std::string range_string = base_str + " " + std::to_string(number); + nvtxRangePush(range_string.c_str()); + } + ~NvtxRange() { nvtxRangePop(); } +}; #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) -namespace fe = cudnn_frontend; -#if CUBLAS_LOWP == CUDA_R_16BF -#define CUDNN_16BIT fe::DataType_t::BFLOAT16 -#else -#define CUDNN_16BIT fe::DataType_t::HALF -#endif - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) static void* cudnn_workspace = NULL; @@ -99,28 +92,24 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); @@ -136,8 +125,8 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) - .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_dim({B, H, T, 1}) + .set_stride({H * T, T, 1, 1}); } checkCudnnFE(graph->validate()); @@ -152,6 +141,7 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); user_maintained_cache_fwd.insert({key, tuple}); @@ -165,47 +155,39 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) - .set_intermediate_data_type(fe::DataType_t::FLOAT) - .set_compute_data_type(fe::DataType_t::FLOAT); + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); // (B, N, 3, NH, HS) // must come from inp (which means we also need to convert THAT to FP16) - auto Q = graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto K = graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto V = graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({B, NH, T, HS}) - .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); - auto O = graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - auto dO = graph->tensor(fe::graph::Tensor_attributes() - .set_name("dO") - .set_dim({B, NH, T, HS}) - .set_stride({NH * HS * T, HS, NH * HS, 1})); - - auto stats = graph->tensor(fe::graph::Tensor_attributes() - .set_name("stats") - .set_dim({B, NH, T, 1}) - .set_stride({NH * T, T, 1, 1}) - .set_data_type(fe::DataType_t::FLOAT)); - auto attn_scale = graph->tensor(fe::graph::Tensor_attributes() - .set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); - auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() - .set_name("flash_attention_backward") - .set_causal_mask(true) - .set_attn_scale(attn_scale); + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, NH, T, HS}) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") + .set_dim({B, NH, T, HS}) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") + .set_dim({B, NH, T, HS}) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") + .set_dim({B, NH, T, 1}) + .set_stride({NH * T, T, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") + .set_causal_mask(true) + .set_attn_scale(attn_scale); // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = 
graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); @@ -227,6 +209,16 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); user_maintained_cache_bwd.insert({key, tuple}); return tuple; @@ -260,16 +252,6 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) variant_pack[softmax_stats] = stats; } - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); cudaCheck(cudaGetLastError()); @@ -304,16 +286,6 @@ void attention_backward_cudnn(floatX* dqkvr, {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, {attn_scale, &attn_scale_cpu}}; - // Reallocate the workspace if the required size is greater than the current workspace - // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum - if (graph->get_workspace_size() > cudnn_workspace_size) { - if (cudnn_workspace_size > 0) { - cudaCheck(cudaFree(cudnn_workspace)); - } - cudnn_workspace_size = graph->get_workspace_size(); - cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); - } - // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); cudaCheck(cudaGetLastError()); From 8675104b73a04b8fcaec393a5e5be357713c5b45 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:24:31 +0100 Subject: [PATCH 014/172] Compile for the user's GPU architecture using nvidia-smi query on Linux --- Makefile | 11 ++++++++--- cudnn_att.cu | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 97e67b97c..efdf4ddf3 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,7 @@ CUDA_OUTPUT_FILE = -o $@ # NVCC flags # -t=0 is short for --threads, 0 = number of CPUs on the machine -# include PTX for both SM52 (Maxwell) and SM80 (Ampere, our main optimisation target at the moment) + native SASS for current GPU -# this increases compile time by ~5%, but we need >=SM70 PTX for some optimisations, and it allows "cuobjdump --dump-sass" to work -NVCC_FLAGS = -O3 -t=0 --use_fast_math -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_80,code=sm_80 +NVCC_FLAGS = -O3 -t=0 --use_fast_math NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = @@ -24,6 +22,13 @@ NVCC_CUDNN = # overridable flag for multi-GPU training. 
by default we won't build with cudnn # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# on linux, try to use nvidia-smi to detect the user's GPU and compile for that specific architecture +ifeq ($(SHELL_UNAME), Linux) + NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') + ifdef NVCC_ARCH + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + endif +endif # autodect a lot of various supports on current platform $(info ---------------------------------------------) diff --git a/cudnn_att.cu b/cudnn_att.cu index fdff63483..398e4e4c3 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -11,14 +11,14 @@ namespace fe = cudnn_frontend; #if defined(ENABLE_FP32) typedef float floatX; static_assert(false, "cuDNN is not supported in FP32 mode.") -// use fp16 (note: this may require gradient scaler, currently not implemented!) +// use fp16 (note: this may require gradient scaler, currently not implemented!) #elif defined(ENABLE_FP16) typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define CUDNN_16BIT fe::DataType_t::HALF -#else // Default to bfloat16 +#else // Default to bfloat16 typedef __nv_bfloat16 floatX; #define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif From 7789738879c214e51227ad633ecaddb7b2405d8f Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:44:24 +0100 Subject: [PATCH 015/172] Add PTX back to binary + fix whitespaces --- Makefile | 2 +- cudnn_att.cu | 8 ++++---- train_gpt2.cu | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index efdf4ddf3..583f86366 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ USE_CUDNN ?= 0 ifeq ($(SHELL_UNAME), Linux) NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) endif endif diff --git a/cudnn_att.cu b/cudnn_att.cu index 398e4e4c3..4664d1827 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -106,10 +106,10 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { .set_dim({B, H, T, HS}) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - .set_data_type(fe::DataType_t::FLOAT)); + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); sdpa_options.set_is_inference(is_inference_only); diff --git a/train_gpt2.cu b/train_gpt2.cu index bc07ca1ff..dbc25677b 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2094,14 +2094,14 @@ void common_start(bool override_enable_tf32 = true) { cublasCheck(cublasSetStream(cublas_handle, main_stream)); cublasCheck(cublasLtCreate(&cublaslt_handle)); cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - + // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? 
CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; // setup the (global) cuBLASLt workspace cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); - + create_cudnn(); } From c15ca1f4cf1508421004020a694199b3eecaa891 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 02:49:16 +0100 Subject: [PATCH 016/172] Fix makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 583f86366..219748185 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ USE_CUDNN ?= 0 ifeq ($(SHELL_UNAME), Linux) NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) + NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH) -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) endif endif From 9663719c628193e3a61403c6c2e49e7c335628b4 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 4 May 2024 23:24:41 -0700 Subject: [PATCH 017/172] Minor fixes for Make for cudnn and windows support Tested on Ubuntu Linux 22.04 and Windows 11 --- Makefile | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 04cbfbb2a..f67ba3f72 100644 --- a/Makefile +++ b/Makefile @@ -67,16 +67,16 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line # You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions +# Follow the apt-get instructions or Windows instructions to install the cuDNN library # And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory +# For this there is no installation, just download the repo to your home directory or directory of your choice # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) @@ -87,7 +87,19 @@ ifeq ($(USE_CUDNN), 1) $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) endif else - $(info → cuDNN is not supported right now outside of Linux) + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + endif endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -191,28 +203,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cu + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) From 2d4e5fd840e9c456c459a5890b40c9bb1746b737 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 4 May 2024 23:24:41 -0700 Subject: [PATCH 018/172] Minor fixes for Makefile for cudnn and windows support Tested on Ubuntu Linux 22.04 and Windows 11 --- Makefile | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 04cbfbb2a..f67ba3f72 100644 --- a/Makefile +++ b/Makefile @@ -67,16 +67,16 @@ else endif # Check and include cudnn if available -# Currently hard-coding a bunch of stuff here for Linux, todo make this better/nicer +# You can override the path to cudnn frontend by setting 
CUDNN_FRONTEND_PATH=your_path on the make command line # You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions +# Follow the apt-get instructions or Windows instructions to install the cuDNN library # And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory +# For this there is no installation, just download the repo to your home directory or directory of your choice # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now - CUDNN_FRONTEND_PATH := $(HOME)/cudnn-frontend/include + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) @@ -87,7 +87,19 @@ ifeq ($(USE_CUDNN), 1) $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) endif else - $(info → cuDNN is not supported right now outside of Linux) + ifneq ($(OS), Windows_NT) + $(info → cuDNN is not supported on MAC OS right now) + else + $(info ✓ Windows cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" + CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) + NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 + NVCC_CUDNN = cudnn_att.obj + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + endif endif else $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) @@ -191,28 +203,28 @@ $(info ---------------------------------------------) all: $(TARGETS) train_gpt2: train_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) test_gpt2: test_gpt2.c - $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE) + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -cudnn_att.o: cudnn_att.cu - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) +$(NVCC_CUDNN): cudnn_att.cu + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu - $(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) 
$(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) $(NVCC_CUDNN) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: $(REMOVE_FILES) $(TARGETS) From 9910a4086394ba5498e4d0191a0f058720324c47 Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 14:42:32 +0100 Subject: [PATCH 019/172] Removed makefile change so we can integrate #339 instead which feels slightly cleaner --- Makefile | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Makefile b/Makefile index 219748185..04cbfbb2a 100644 --- a/Makefile +++ b/Makefile @@ -22,13 +22,6 @@ NVCC_CUDNN = # overridable flag for multi-GPU training. by default we won't build with cudnn # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 -# on linux, try to use nvidia-smi to detect the user's GPU and compile for that specific architecture -ifeq ($(SHELL_UNAME), Linux) - NVCC_ARCH := $(shell which nvidia-smi > /dev/null 2>&1 && nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -n 1 | sed 's/\.//g') - ifdef NVCC_ARCH - NVCC_FLAGS += -gencode arch=compute_$(NVCC_ARCH),code=compute_$(NVCC_ARCH) -gencode arch=compute_$(NVCC_ARCH),code=sm_$(NVCC_ARCH) - endif -endif # autodect a lot of various supports on current platform $(info ---------------------------------------------) From 876ab93c0a9df1c1b3765f3df3a927364685830b Mon Sep 17 00:00:00 2001 From: ademeure Date: Sun, 5 May 2024 15:05:59 +0100 Subject: [PATCH 020/172] Add FP16 path for atomicStochasticAdd (+remove __bfloat1622float2 to work on older CUDAs) + fixes --- cudnn_att.cu | 20 +++++++++++--------- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 26 +++++++++++++++----------- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/cudnn_att.cu b/cudnn_att.cu index 4664d1827..fd9760b1a 100644 --- a/cudnn_att.cu +++ b/cudnn_att.cu @@ -34,15 +34,17 @@ static void cudaCheck(cudaError_t error, const char *file, int line) { #define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) // Profiler utils -class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - ~NvtxRange() { nvtxRangePop(); } -}; +namespace { + class NvtxRange { + public: + NvtxRange(const char* s) { nvtxRangePush(s); } + NvtxRange(const std::string& base_str, int number) { + std::string range_string = base_str + " " + std::to_string(number); + nvtxRangePush(range_string.c_str()); + } + ~NvtxRange() { nvtxRangePop(); } + }; +} #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) static cudnnHandle_t cudnn_handle; diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 8c12628e5..c29cd6a08 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -28,7 +28,7 @@ the profile.ncu-rep from a cloud box to local to pretty view. 
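For context on the NvtxRange helper that patch 020 wraps in an anonymous namespace above (presumably to keep the definition local to cudnn_att.cu, since train_gpt2.cu defines the same class): it is an RAII wrapper around nvtxRangePush/nvtxRangePop, so a profiling range is tied to a scope and cannot be left unbalanced. A hedged usage sketch follows; attention_forward_example is a made-up caller, not a function from the repo, and it assumes the NvtxRange class and NVTX_RANGE_FN macro as defined in the patch above.

// toy caller showing how the RAII ranges nest in an Nsight timeline
void attention_forward_example(float* out, const float* inp, int B, int T, int C) {
    NVTX_RANGE_FN();                        // pushes a range named after this function
    for (int l = 0; l < 12; l++) {
        NvtxRange layer_range("layer", l);  // nested range labelled "layer <l>"
        // ... kernel launches for this layer show up under the nested range ...
    }                                       // per-layer range popped at the end of each iteration
}                                           // function-level range popped on return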
#include "train_gpt2.cu" int main() { - common_start(); + common_start(true, true); // build the GPT-2 model from a checkpoint GPT2 model; diff --git a/test_gpt2.cu b/test_gpt2.cu index 3fc6b6f0e..d7944125c 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,7 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { - common_start(false); + common_start(false, true); // build the GPT-2 model from a checkpoint GPT2 model; diff --git a/train_gpt2.cu b/train_gpt2.cu index dbc25677b..1a1a53752 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -473,19 +473,21 @@ __global__ void encoder_forward_kernel3(floatX* out, store128(out_btc, packed_out); } -__device__ void atomicStochasticAdd(__nv_bfloat16* address, float val0, float val1, uint seed) { +template +__device__ void atomicStochasticAdd(T* address, float val0, float val1, uint seed) { + static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); float2 val = make_float2(val0, val1); uint* address_as_uint = (uint*)address; uint old = *address_as_uint, assumed; uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); do { assumed = old; - float2 old_fp32 = __bfloat1622float2(*(__nv_bfloat162*)&old); - float2 new_fp32 = make_float2(old_fp32.x + val.x, old_fp32.y + val.y); - __nv_bfloat162 new_bf16; - stochastic_rounding(new_fp32.x, &new_bf16.x, random); - stochastic_rounding(new_fp32.y, &new_bf16.y, random >> 16); - old = atomicCAS(address_as_uint, assumed, *(uint*)&new_bf16); + float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, + (float)(reinterpret_cast(&old)[1]) + val.y); + T new_rounded[2]; + stochastic_rounding(new_fp32.x, &new_rounded[0], random); + stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); + old = atomicCAS(address_as_uint, assumed, *(uint*)&new_rounded); } while (assumed != old); } __device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { @@ -2074,12 +2076,14 @@ void gpt2_free(GPT2 *model) { // ---------------------------------------------------------------------------- // common init & free code for train/test/profile -void common_start(bool override_enable_tf32 = true) { +void common_start(bool override_enable_tf32 = true, bool print_device_info = true) { int deviceIdx = 0; cudaCheck(cudaSetDevice(deviceIdx)); cudaGetDeviceProperties(&deviceProp, deviceIdx); - printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); + if (print_device_info) { + printf("[System]\n"); + printf("Device %d: %s\n", deviceIdx, deviceProp.name); + } cudaCheck(cudaStreamCreate(&main_stream)); cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); @@ -2335,7 +2339,7 @@ int main(int argc, char *argv[]) { printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); - common_start(override_enable_tf32); // common init code for train/test/profile + common_start(override_enable_tf32, false); // common init code for train/test/profile const char* precision_str = (PRECISION_MODE == PRECISION_FP32) ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? 
"TF32" : "FP32") From 804a9af1dc2cc4a387dacfa28aba658ce14262a6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 5 May 2024 23:43:02 +0300 Subject: [PATCH 021/172] make things compile with nvcc11 --- train_gpt2.cu | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1a1a53752..86ceb5caa 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -167,6 +167,21 @@ void mpi_check(int status, const char *file, int line) { #define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) #endif +// older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being unsigned shorts. +// we need to be careful here to only define our own versions if none already exist, otherwise the compiler will +// complain. +// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) +#if defined(ENABLE_BF16) and __CUDACC_VER_MAJOR__ < 12 and not(__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +__device__ floatX __ldcs(const floatX* address) { + unsigned short bf = __ldcs(reinterpret_cast(address)); + return __nv_bfloat16_raw{bf}; +} + +__device__ void __stcs(floatX* address, floatX value) { + __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); +} +#endif + // warp-level reduction for summing values __device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -1056,7 +1071,7 @@ __global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) { __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { dst[idx] += (floatX)src[idx]; } // have to += because dbias is a paramater + if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater } // ---------------------------------------------------------------------------- From bbfe8c989c06c145f33c6b3890cdc688dc2dca2c Mon Sep 17 00:00:00 2001 From: lancer Date: Sun, 5 May 2024 16:59:58 -0700 Subject: [PATCH 022/172] Minor update on the code --- dev/cuda/classifier_fused.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index df6894113..9202c2cee 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -5,7 +5,7 @@ much of a restriction: In pretraining, it is just a constant 1/batch_size tensor out the input prompt, but that is known in advance. 
Compile example: -nvcc -O3 --use_fast_math classifier_fused.cu -o classifier_fused +nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_fused ./classifier_fused 1 ./classifier_fused 2 @@ -198,7 +198,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(cg::thread_block_tile<32>& wa float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+3)/4 + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + for (int i = ceil_div(V, 4) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { float4 v4 = x_vec4[i]; #pragma unroll for(int k = 0; k < 4; k++) { @@ -207,7 +207,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(cg::thread_block_tile<32>& wa } float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, vec_at(v4, k)); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(vec_at(v4, k) - thread_maxval); } } @@ -270,7 +270,7 @@ __global__ void fused_classifier_kernel2(float* dlogits, float* losses, float* p // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const float4* logits_vec4 = reinterpret_cast(logits + idx * P); - for (int i = threadIdx.x; i < (V+3)/4; i += blockDim.x) { + for (int i = threadIdx.x; i < ceil_div(V, 4); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache persistence float4 v4 = __ldcs(&logits_vec4[i]); @@ -307,7 +307,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide_nofloat4(cg::thread_block_til float v = x[i]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(v - thread_maxval); } @@ -390,16 +390,16 @@ __device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, i float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+3)/4 + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + for (int i = ceil_div(V, f128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { f128 packed_x = load128cs(x + i * f128::size); // load and do not keep in cache for(int k = 0; k < packed_x.size; ++k) { - if (i*4+k >= V) { // bounds checking against real V + if (i*f128::size+k >= V) { // bounds checking against real V continue; } float v = (float)packed_x[k]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval *= expf(old_maxval - thread_maxval); thread_sumval += expf(v - thread_maxval); } } @@ -457,7 +457,7 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const float* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < (V+f128::size-1)/f128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < ceil_div(V , f128::size); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache 
persistence f128 packed_logits_vec = load128cs(logits_vec + i * f128::size); // load and do not keep in cache From 2bfd2b6a7e3881d65414b50fdd87d37656b12f91 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 00:46:28 -0700 Subject: [PATCH 023/172] Auto-detect GPU capability 3 cases tested on Windows 11 and Ubuntu 22.04 --- Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Makefile b/Makefile index 04cbfbb2a..3ec092466 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,23 @@ NVCC_CUDNN = # because it bloats up the compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# Function to check if a file exists in the PATH +define file_exists_in_path + $(shell where $(1) 2>nul || which $(1) 2>/dev/null) +endef + +ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif +endif + +# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above +ifneq ($(GPU_COMPUTE_CAPABILITY),) + NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + # autodect a lot of various supports on current platform $(info ---------------------------------------------) From 0c4908d8153e051d5d33deff6a3d799da973041d Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 01:11:27 -0700 Subject: [PATCH 024/172] Adding CI check to disable auto-detect --- Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 3ec092466..114568910 100644 --- a/Makefile +++ b/Makefile @@ -28,10 +28,12 @@ define file_exists_in_path $(shell where $(1) 2>nul || which $(1) 2>/dev/null) endef -ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= - ifneq ($(call file_exists_in_path, __nvcc_device_query),) - GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) - GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif endif endif From 7a8f471bc77f90ac8385ad968721998cddb7ecbb Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 6 May 2024 17:49:58 +0800 Subject: [PATCH 025/172] further reorganization --- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 74 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 97ea3ce09..c29cd6a08 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -53,7 +53,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, model.num_parameters, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free diff --git a/test_gpt2.cu b/test_gpt2.cu index dfe5486cc..d7944125c 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -255,7 +255,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], 
tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, model.num_parameters, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index 0c28eff4f..adcd2812f 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2088,6 +2088,7 @@ void gpt2_backward(GPT2 *model) { // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU + if (multi_gpu_config->num_processes == 1) return value; // MPI doesn't support all reduce with mean, so we sum up, then divide. float result; mpiCheck(MPI_Allreduce(&value, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD)); @@ -2104,11 +2105,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); #ifdef MULTI_GPU - // all gather is only required when num_processes > 1 - if (multi_gpu_config->num_processes == 1) { - return; - } - + if (multi_gpu_config->num_processes == 1) return; // Average all gradients. ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, @@ -2119,18 +2116,18 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, size_t shard_num_parameters, size_t shard_offset) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { NVTX_RANGE_FN(); // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - // lazily allocate the memory for m_memory and v_memory according to shard configs + // lazily allocate the memory for m_memory and v_memory if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, shard_num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); @@ -2140,19 +2137,49 @@ void gpt2_update(GPT2 *model, float learning_rate, float 
beta1, float beta2, flo } } - floatX* params_memory = (floatX*)model->params_memory + shard_offset; - floatX* grads_memory = (floatX*)model->grads_memory + shard_offset; + int block_size = 512; + int num_blocks = CEIL_DIV(model->num_parameters, block_size); + float beta1_correction = 1.0f - powf(beta1, t); + float beta2_correction = 1.0f - powf(beta2, t); + unsigned int seed = random_u32(&model->rng_state); + adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, + (floatX*)model->grads_memory, model->m_memory, model->v_memory, + model->num_parameters, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); + cudaCheck(cudaGetLastError()); +} + +void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { + NVTX_RANGE_FN(); + if (model->m_memory == NULL) { + cudaCheck(cudaMalloc((void**)&model->m_memory, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, multi_gpu_config->shard_num_parameters* sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + if (model->use_master_weights == 1) { + cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); + copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + cudaCheck(cudaGetLastError()); + printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); + } + } + + floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; float* master_weights = NULL; if (model->use_master_weights == 1) { - master_weights = model->master_weights + shard_offset; + master_weights = model->master_weights + multi_gpu_config->shard_offset; } int block_size = 512; - int num_blocks = CEIL_DIV(shard_num_parameters, block_size); + int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, model->m_memory, model->v_memory, shard_num_parameters, + adamw_kernel3<<>>(params_memory, master_weights, grads_memory, + model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); } @@ -2160,10 +2187,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - // all gather is only required when num_processes > 1 - if (multi_gpu_config->num_processes == 1) { - return; - } + if (multi_gpu_config->num_processes == 1) return; if (multi_gpu_config->zero_stage == 1) { // gather all parameter updates from each 
process @@ -2616,9 +2640,13 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, false); gpt2_zero_grad(&model); gpt2_backward(&model); +#ifndef MULTI_GPU + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); +#else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, multi_gpu_config.shard_num_parameters, multi_gpu_config.shard_offset); + gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); +#endif // todo - move or double-buffer all of this timing logic to avoid idling the GPU at this point! cudaEventRecord(end); From 134f4c7fc8e9092f09d30ad15efa979600e4e656 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 12:38:40 -0700 Subject: [PATCH 026/172] Adding two directory search for cuDNN frontend files Search in $(HOME) and in the current directory. --- Makefile | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 253e5d986..c4879588b 100644 --- a/Makefile +++ b/Makefile @@ -94,23 +94,33 @@ endif # and then we include it below (see currently hard-coded path assumed in home directory) ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now - CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include - ifeq ($(shell [ -d $(CUDNN_FRONTEND_PATH) ] && echo "exists"), exists) + # hard-coded path for now in either . or ($HOME) directory + # this can be overridden by setting CUDNN_FRONTEND_PATH on the command line + ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) - NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) - NVCC_LDFLAGS += -lcudnn - NVCC_FLAGS += -DENABLE_CUDNN - NVCC_CUDNN = cudnn_att.o + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"),) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) endif - else + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + NVCC_CUDNN = cudnn_att.o + else ifneq ($(OS), Windows_NT) $(info → cuDNN is not supported on MAC OS right now) else $(info ✓ Windows cuDNN found, will run with flash-attention) - CUDNN_FRONTEND_PATH ?= ..\..\cudnn-frontend\include #override on command line if different location + ifeq ($(shell if exist "$(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= $(HOMEDRIVE)$(HOMEPATH)\cudnn-frontend\include #override on command line if different location + else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) + CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location + else + $(error ✗ cuDNN not found. 
See the Makefile for our currently hard-coded paths / install instructions) + endif CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 @@ -214,7 +224,7 @@ ifeq ($(NVCC),) $(info ✗ nvcc not found, skipping GPU/CUDA builds) else $(info ✓ nvcc found, including GPU/CUDA support) - TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu + TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu $(NVCC_CUDNN) endif $(info ---------------------------------------------) From 69f6c4f765cdebceede2b32501d765b68fcc30cc Mon Sep 17 00:00:00 2001 From: Horace He Date: Mon, 6 May 2024 13:10:17 -0700 Subject: [PATCH 027/172] Don't return logits during training for PyTorch baseline This improves perf somewhat, since currently it's always returning logits (which thus need to be materialized). --- train_gpt2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index f2fa68c9b..1446e6165 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -147,12 +147,11 @@ def forward(self, idx, targets=None): # if we are given some desired targets also calculate the loss logits = self.lm_head(x) loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) + return None, loss else: # inference-time mini-optimization: only forward the lm_head on the very last position logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim - loss = None - - return logits, loss + return logits, None @classmethod def from_pretrained(cls, model_type): From 5adb6ef2c1223203bbfbe0a43f573dfa503c68e7 Mon Sep 17 00:00:00 2001 From: ademeure Date: Mon, 6 May 2024 21:24:42 +0100 Subject: [PATCH 028/172] Add tensor core and overall efficiency stats to profiler script. 
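The idea behind the new profiler stats in this patch: each kernel is assumed to be limited either by DRAM bandwidth or by tensor-core throughput, so its efficiency is taken as the larger of the two utilisation ratios, and the run-level number is the time-weighted average across kernels (with the tensor-core peak clamped to 50% or 100%, since consumer GPUs top out at 50% on this counter). A hedged sketch of that arithmetic in C, with illustrative names rather than the script's own:

// per-kernel and overall efficiency as described above (illustrative, not the script's code)
typedef struct { float time_ms; float dram_gib_rw; float tensor_pct; } KernelStat;

float overall_efficiency_pct(const KernelStat* k, int n,
                             float max_dram_bw, float max_tensor_pct) {
    float weighted = 0.0f, total_time = 0.0f;
    for (int i = 0; i < n; i++) {
        float dram_bw = k[i].dram_gib_rw / (k[i].time_ms / 1000.0f); // bandwidth actually achieved
        float eff_bw  = dram_bw / max_dram_bw;                       // DRAM utilisation ratio
        float eff_tc  = k[i].tensor_pct / max_tensor_pct;            // tensor core utilisation ratio
        float eff     = (eff_bw > eff_tc) ? eff_bw : eff_tc;         // assume the better one is the limiter
        weighted   += eff * k[i].time_ms;                            // time-weighted accumulation
        total_time += k[i].time_ms;
    }
    return 100.0f * weighted / total_time;
}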
--- profile_gpt2cu.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 8e8847369..d9dbd4f8e 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -39,7 +39,8 @@ "dram__bytes_write.sum", # DRAM writes "lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B) "lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B) - "smsp__inst_executed.sum", # instructions + "sm__pipe_tensor_op_hmma_cycles_active.avg.pct_of_peak_sustained_active", # % of peak tensor core utilization + "smsp__inst_executed.sum", # instructions ] cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)] result = subprocess.check_output(cmd, text=True).strip() @@ -72,16 +73,35 @@ assert CLS_START != -1 +# Check every kernel to find the maximum DRAM bandwidth and Tensor Core utilisation values +max_dram_bw = 0.0 +max_tensor = 0.0 +for rid, row in kernel_profile_data: + if rid <= 2: + continue + time = float(row[13]) + read = float(row[11]) + write = float(row[12]) + tensor = float(row[16]) + dram_bw = (read + write) / (time / 1000.0) + max_dram_bw = max(max_dram_bw, dram_bw) + max_tensor = max(max_tensor, tensor) + +# round the maximum tensor core utilisation to 50% or 100% +# consumer GPUs can only achieve 50% of peak tensor throughput on this counter +# and for GPUs without tensor cores, we set the value to 50% to avoid division by zero +max_tensor = (max_tensor > 50.0) and 100.0 or 50.0 + print() print("Kernel calls:") for rid, row in kernel_profile_data: if rid == 0: # headings - print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") + print( f"id pass {'name':<40} {'time':>8} {'RAM BW':>8} {'tensor':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}") continue if rid == 1: # units - units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" + units = f" {'':<40} {'ms':>8} {'GB/s':>8} {'core %':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}" print(units) print("." * len(units)) continue @@ -95,7 +115,9 @@ write = float(row[12]) l2_read = float(row[14]) l2_write = float(row[15]) - inst = float(row[16]) / 1e6 + tensor = float(row[16]) + inst = float(row[17]) / 1e6 + dram_bw = (read + write) / (time / 1000.0) kid = rid - 2 @@ -149,6 +171,7 @@ l2_read = l2_read * 32 / 1024 / 1024 / 1024 l2_write = l2_write * 32 / 1024 / 1024 / 1024 + efficiency = max(dram_bw / max_dram_bw, tensor / max_tensor) summaries[fn_name] += time counts[fn_name] += multiplier passes[pass_name] += time @@ -159,13 +182,18 @@ total['l2_read'] += l2_read total['l2_write'] += l2_write total['inst'] += inst + total['tensor'] += tensor * time # % so multiplied by time + total['efficiency'] += efficiency * time pass_info = f"{pass_name}×{multiplier}" - print(f"{kid:02} {pass_info:7} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + print(f"{kid:02} {pass_info:7} {fn_name:<40} {time:8.2f} {dram_bw:8.1f} {tensor:8.1f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}") + total_time = total['time'] +avg_dram_bw = (total['read'] + total['write']) / (total_time / 1000.0) +avg_tensor_util = total['tensor'] / total_time print("." 
* len(units)) -print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") +print(f" {'Total':<40} {total['time']:8.2f} {avg_dram_bw:8.1f} {avg_tensor_util:8.1f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}") print() print("Kernel type summaries:") @@ -192,5 +220,9 @@ We read {total['read']:.1f}GiB ({total['read']/ts:.1f}GB/s) and write {total['write']:.1f}GiB ({total['write']/ts:.1f}GB/s) to DRAM, read {total['l2_read']:.1f}GiB ({total['l2_read']/ts:.1f}GB/s) and write {total['l2_write']:.1f}GiB ({total['l2_write']/ts:.1f}GB/s) to L2, and execute {total['inst'] / 1000:.1f} billion instructions ({total['inst'] / 1000 / ts:.1f} GInst/s). + +Assuming that every kernel should be either fully DRAM bandwidth or tensor core limited, +with a peak DRAM bandwidth of {max_dram_bw:.1f}GB/s and a peak tensor throughput of {max_tensor:.1f}%, +our overall efficiency is {(total['efficiency'] * 100.0 / total_time):.1f}%. """ print(summary) \ No newline at end of file From f7d77600845537597654e4ace1ef0914ab413277 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 13:56:17 -0700 Subject: [PATCH 029/172] Non-standard C syntax replace with standard C Also, adding unistd.h back because that has some changes required to find M_PI. --- train_gpt2.cu | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 86ceb5caa..7a73cb612 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -34,6 +34,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), -a 1 is "overfit single batch", -x 10 is 10 iterations, and -f 0 disables tf32 */ +#include #include #include #include @@ -53,6 +54,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +#undef FLT_MAX // ---------------------------------------------------------------------------- // CUDA precision settings @@ -171,7 +173,7 @@ void mpi_check(int status, const char *file, int line) { // we need to be careful here to only define our own versions if none already exist, otherwise the compiler will // complain. 
// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) -#if defined(ENABLE_BF16) and __CUDACC_VER_MAJOR__ < 12 and not(__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) __device__ floatX __ldcs(const floatX* address) { unsigned short bf = __ldcs(reinterpret_cast(address)); return __nv_bfloat16_raw{bf}; @@ -489,12 +491,12 @@ __global__ void encoder_forward_kernel3(floatX* out, } template -__device__ void atomicStochasticAdd(T* address, float val0, float val1, uint seed) { +__device__ void atomicStochasticAdd(T* address, float val0, float val1, unsigned int seed) { static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); float2 val = make_float2(val0, val1); - uint* address_as_uint = (uint*)address; - uint old = *address_as_uint, assumed; - uint random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; + unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); do { assumed = old; float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, @@ -502,17 +504,17 @@ __device__ void atomicStochasticAdd(T* address, float val0, float val1, uint see T new_rounded[2]; stochastic_rounding(new_fp32.x, &new_rounded[0], random); stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); - old = atomicCAS(address_as_uint, assumed, *(uint*)&new_rounded); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_rounded); } while (assumed != old); } -__device__ void atomicStochasticAdd(float* address, float val0, float val1, uint seed) { +__device__ void atomicStochasticAdd(float* address, float val0, float val1, unsigned int seed) { atomicAdd(address, val0); atomicAdd(address + 1, val1); } __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C, uint seed) { + int B, int T, int C, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int N = B * T * C; idx *= 2; // 2 elements per thread @@ -1090,7 +1092,7 @@ void encoder_forward(floatX* out, void encoder_backward(floatX* dwte, floatX* dwpe, const floatX* dout, const int* inp, - int B, int T, int C, uint seed) { + int B, int T, int C, unsigned int seed) { NVTX_RANGE_FN(); const int N = B * T * C; const int block_size = 256; From 9b55ea8cf5b6ad5796b8b11f8c2d6fc4da2f6161 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:11:42 +0100 Subject: [PATCH 030/172] More crazy optimisations to layernorm_backward, fused_classifier, and matmul_backward_bias. 
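Much of this patch moves the classifier and layernorm-backward kernels onto the 128-bit Packed128 accessors (x128 is Packed128<floatX>, i.e. 8 elements for BF16/FP16 or 4 for FP32) while keeping the arithmetic in FP32. As a reminder of the access pattern the kernels below rely on, here is a toy kernel using those helpers; scale_kernel_example is illustrative only and assumes N is a multiple of x128::size:

// each thread moves one 128-bit vector per load/store; the .cs variants hint "streaming"
// (evict-first) so data that is touched only once does not thrash the caches
__global__ void scale_kernel_example(floatX* out, const floatX* inp, float alpha, int N) {
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size;
    if (idx >= N) { return; }
    x128 packed_in = load128cs(inp + idx);   // load 128 bits, do not keep in cache
    x128 packed_out;
    for (int k = 0; k < x128::size; k++) {
        packed_out[k] = (floatX)((float)packed_in[k] * alpha);  // do the maths in FP32
    }
    store128(out + idx, packed_out);         // one 128-bit store per thread
}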
--- dev/cuda/classifier_fused.cu | 222 ++++++++++++++++++++++++++---- dev/cuda/layernorm_backward.cu | 142 +++++++++++++++++++- train_gpt2.cu | 238 ++++++++++++++++++++++----------- 3 files changed, 499 insertions(+), 103 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 9202c2cee..522ac5135 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -21,6 +21,21 @@ nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_f #include #include "common.h" +// todo - this file does not properly support anything but FP32 +// kernel 5 can be run in fp16/bf16 to test performance, but the outputs will be wrong +#undef ENABLE_BF16 +#undef ENABLE_FP16 +#define ENABLE_BF16 + +#if defined(ENABLE_BF16) +typedef __nv_bfloat16 floatX; +#elif defined(ENABLE_FP16) +typedef half floatX; +#else +typedef float floatX; +#endif +typedef Packed128 x128; + // ---------------------------------------------------------------------------- // CPU code reference @@ -382,18 +397,18 @@ __global__ void fused_classifier_kernel3(float* dlogits, float* losses, float* p } } -__device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, int V, int P) { +__device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const floatX* inp, int V, int P) { // one row of inp, i.e. inp[idx, :] of shape (V,) - const float* x = inp + idx * P; + const floatX* x = inp + idx * P; float thread_maxval = -INFINITY; float thread_sumval = 0.0f; // do the loop in reverse to maximise probability of L2 cache hits // so even small L2s get some hits on the 2nd read of the same thread - for (int i = ceil_div(V, f128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { - f128 packed_x = load128cs(x + i * f128::size); // load and do not keep in cache + for (int i = ceil_div(V, x128::size) + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { + x128 packed_x = load128cs(x + i * x128::size); // load and do not keep in cache for(int k = 0; k < packed_x.size; ++k) { - if (i*f128::size+k >= V) { // bounds checking against real V + if (i*x128::size+k >= V) { // bounds checking against real V continue; } float v = (float)packed_x[k]; @@ -436,9 +451,9 @@ __device__ SoftmaxParams prepare_softmax_blockwide2(int idx, const float* inp, i return SoftmaxParams{1.f / block_sumval, block_maxval}; } -// same as 2 but not using float4 -__global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* probs, - const float* logits, const float* dlosses, const int* targets, +// same as 2 but using x128 +__global__ void fused_classifier_kernel4(floatX* dlogits, floatX* losses, floatX* probs, + const floatX* logits, const floatX* dlosses, const int* targets, int B, int T, int V, int P) { int idx = blockIdx.x; int ix = targets[idx]; @@ -448,21 +463,21 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p // calculate the probability needed for the loss and update (single-threaded) if(threadIdx.x == 0) { - float prob = expf(logits[idx * P + ix] - sp.Offset) * sp.Scale; + float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; losses[idx] = -logf(prob); } // very sensible default for dlosses is 1/(B*T), which is the uniform loss - float dloss = dlosses != NULL ? dlosses[idx] : 1.0f / (B*T); + float dloss = dlosses != NULL ? 
(float)dlosses[idx] : 1.0f / (B*T); // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging - const float* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < ceil_div(V , f128::size); i += blockDim.x) { + const floatX* logits_vec = logits + idx * P; + for (int i = threadIdx.x; i < ceil_div(V , x128::size); i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // this data will never be needed again, so we reduce cache persistence - f128 packed_logits_vec = load128cs(logits_vec + i * f128::size); // load and do not keep in cache - f128 packed_probs; - f128 packed_dlogits; + x128 packed_logits_vec = load128cs(logits_vec + i * x128::size); // load and do not keep in cache + x128 packed_probs; + x128 packed_dlogits; for(int k = 0; k < packed_logits_vec.size; ++k) { int element = i*packed_logits_vec.size + k; if (element >= V) { // bounds checking against real V @@ -474,6 +489,7 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p float indicator = (element == ix) ? 1.0f : 0.0f; packed_dlogits[k] = (prob - indicator) * dloss; } + // Note: missing .cs hint hurts our performance due to cache thrashing, fixed in kernel5 store128(dlogits + idx * P + i * packed_logits_vec.size, packed_dlogits); if (probs != NULL) { store128(probs + idx * P + i * packed_logits_vec.size, packed_probs); @@ -481,6 +497,143 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p } } +// todo - move to common.h - or ideally somewhere it's not duplicated between train & common? +// requires all 32 threads in the warp to be active, but should work for any block size +// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes +// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end +// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 +using reduction_func_t = float (*) (float); +template +__device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { + // two reductions of up to 1024 threads: + // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) + __shared__ float shared_val[32]; + const int lane_id = threadIdx.x % 32; + const int warp_id = threadIdx.x / 32; + const int num_warps = blockDim.x / 32; + + float warp_val = warp_reduction(val); + if (lane_id == 0) { shared_val[warp_id] = warp_val; } + __syncthreads(); + warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; + float block_val = warp_reduction(warp_val); + + if (final_sync) { + __syncthreads(); // only needed in loops when effectively reusing shared memory etc. + } + return block_val; +} + +__device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, int V, int P) { + // same but not float4 + // one row of inp, i.e. 
inp[idx, :] of shape (V,) + + const floatX* x = inp + idx * P; + float thread_maxval = -INFINITY; + float thread_sumval = 0.0f; + int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; + + // special-case loop to handle the unaligned elements at the end of the array + // this lets us skip the bounds check in the main loop below, which improves performance + while ((i+1)*x128::size > V) { + for(int k = 0; k < x128::size; ++k) { + if (i*x128::size+k >= V) { + break; // bounds checking against real V (rather than padded P) + } + float v = (float)x[i*x128::size+k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + i -= blockDim.x; + } + + // main loop for the bulk of the iterations (no bounds checking required!) + for (; i >= 0; i -= blockDim.x) { + x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop + for(int k = 0; k < x128::size; ++k) { + float v = (float)packed_x[k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + } + + // Block Max Reduction -> Maths -> Block Sum Reduction + float block_maxval = blockReduce(thread_maxval, false, -FLT_MAX); + thread_sumval *= expf(thread_maxval - block_maxval); + float block_sumval = blockReduce(thread_sumval); + + // return the softmax parameters + return SoftmaxParams{1.f / block_sumval, block_maxval}; +} + +// will _update_ logits to logit gradients +// uses template to decide whether to write logits and probs +// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts +template +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + fused_classifier_kernel5(floatX* dlogits, floatX* losses, floatX* probs, + const floatX* logits, const floatX* dlosses, const int* targets, + int B, int T, int V, int P) { + int idx = blockIdx.x; + int ix = targets[idx]; + + // softmax (reading B * T * V, same logits read again below, hopefully still in cache) + SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); + + // calculate the probability needed for the loss and update (single-threaded) + if(threadIdx.x == 0) { + float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; + losses[idx] = (floatX)(-logf(prob)); + } + + // very sensible default for dlosses is 1/(B*T), which is the uniform loss + float dloss = (dlosses != NULL) ? (float)dlosses[idx] : 1.0f / (B*T); + // calculate the gradients directly, saves bandwidth from probs during training + // but also supports writing probs for inference-only and debugging + const floatX* logits_vec = logits + idx * P; + int i = threadIdx.x; + for (; i < V/x128::size; i += blockDim.x) { + // this is the 2nd read of logits after the one in prepare_softmax2 + // it will be overwritten by the logits gradients which is when we reduce cache persistence + x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs + x128 packed_probs; + for(int k = 0; k < x128::size; ++k) { + int element = i*x128::size + k; + float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; + packed_probs[k] = (floatX)prob; + float indicator = (element == ix) ? 
1.0f : 0.0f; + packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); + } + if (WriteLogits){ + // reduce cache persistence for the overwritten logits + // to maximise probability that logits remain in cache between prepare_softmax and here + store128cs(dlogits + idx * P + i * x128::size, packed_logits_vec); + } + if (WriteProbs) { + store128(probs + idx * P + i * x128::size, packed_probs); + } + } + + // handle remaining elements after the last multiple of x128::size + // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements + i *= x128::size; + for (; i < V; i++) { + float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; + float indicator = (i == ix) ? 1.0f : 0.0f; + float dlogit = (prob - indicator) * dloss; + if (WriteLogits){ + __stcs(dlogits + idx * P + i, (floatX)dlogit); + } + if (WriteProbs) { + probs[idx * P + i] = (floatX)prob; + } + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -519,7 +672,16 @@ void fused_classifier4(float* dlogits, float* losses, int B, int T, int V, int P, int block_size) { const int N = B * T; const int grid_size = N; - fused_classifier_kernel4<<>>(dlogits, losses, NULL, logits, dlosses, targets, B, T, V, P); + fused_classifier_kernel4<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); + cudaCheck(cudaGetLastError()); +} + +void fused_classifier5(float* dlogits, float* losses, + const float* logits, const float* dlosses, const int* targets, + int B, int T, int V, int P, int block_size) { + const int N = B * T; + const int grid_size = N; + fused_classifier_kernel5<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -539,6 +701,9 @@ void fused_classifier(int kernel_num, float* dlogits, float* losses, case 4: fused_classifier4(dlogits, losses, logits, dlosses, targets, B, T, V, P, block_size); break; + case 5: + fused_classifier5(dlogits, losses, logits, dlosses, targets, B, T, V, P, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -606,17 +771,22 @@ int main(int argc, char **argv) { crossentropy_forward_cpu(losses, probs, targets, B, T, V); crossentropy_softmax_backward_cpu(dlogits, dlosses, probs, targets, B, T, V); - // time the kernel at different block sizes - for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { - int block_size = block_sizes[j]; - printf("Checking block size %d.\n", block_size); - fused_classifier(kernel_num, d_dlogits, d_losses, d_logits, d_dlosses, d_targets, B, T, V, P, block_size); - validate_result(d_losses, losses, "losses", B * T, 1e-4f); - // undo the padding before we can check for correctness - cudaCheck(cudaMemcpy2D(d_dlogits_no_pad, V * sizeof(float), d_dlogits, P * sizeof(float), V * sizeof(float), B * T, cudaMemcpyDeviceToDevice)); - validate_result(d_dlogits_no_pad, dlogits, "dlogits", B * T * V, 1e-4f); +#if defined(ENABLE_BF16) || defined(ENABLE_FP16) + if (kernel_num < 4) // kernel 4/5 + BF16 is only for testing performance, it doesn't do the format conversions yet etc... 
+#endif + { + // time the kernel at different block sizes + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + fused_classifier(kernel_num, d_dlogits, d_losses, d_logits, d_dlosses, d_targets, B, T, V, P, block_size); + validate_result(d_losses, losses, "losses", B * T, 1e-4f); + // undo the padding before we can check for correctness + cudaCheck(cudaMemcpy2D(d_dlogits_no_pad, V * sizeof(float), d_dlogits, P * sizeof(float), V * sizeof(float), B * T, cudaMemcpyDeviceToDevice)); + validate_result(d_dlogits_no_pad, dlogits, "dlogits", B * T * V, 1e-4f); + } + printf("All results match. Starting benchmarks.\n\n"); } - printf("All results match. Starting benchmarks.\n\n"); for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index c1f01b0e6..1f432ba82 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -32,6 +32,7 @@ typedef half floatN; typedef float floatX; typedef float floatN; #endif +typedef Packed128 x128; // ---------------------------------------------------------------------------- // CPU code reference @@ -125,7 +126,7 @@ void layernorm_backward_cpu(float* dinp, float* dweight, float* dbias, // GPU kernels // GPU helper functions for atomicAdd on smaller than 32-bit types -__device__ floatX warpReduceSum(floatX val) { +__device__ float warpReduceSum(float val) { for (int offset = 16; offset > 0; offset /= 2) { val += __shfl_xor_sync(0xFFFFFFFF, val, offset); } @@ -751,6 +752,128 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX } } +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { + extern __shared__ float shared[]; // size = 2 * C + 1 + int warpId = threadIdx.x / warpSize; // warp index within a block + int warpsInBlock = blockDim.x / warpSize; //number of warps in block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = warpSize * x128::size; + int iterations_C = C / C_per_iteration; + + // the first half of shared memory is bias, second is weight + float* dbias_shared = shared; + float* dweight_shared = shared + C; + + // init shared memory to zero + for(int i = threadIdx.x; i < C; i+= blockDim.x){ + dbias_shared[i] = 0.0f; + dweight_shared[i] = 0.0f; + } + unsigned int *tmp_flag = (unsigned int*)(shared + C*2); + __syncthreads(); + + for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { + int b = idx / T; + int t = idx % T; + + const floatX* dout_bt = dout + b * T * C + t * C; + const floatX* inp_bt = inp + b * T * C + t * C; + floatX* dinp_bt = dinp + b * T * C + t * C; + const float mean_bt = (float)mean[b * T + t]; + const float rstd_bt = (float)rstd[b * T + t]; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = 
((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } + } + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; + + // now iterate again and accumulate all the gradients + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + // gradient contribution to bias (using shared memory friendly index) + atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + // gradient contribution to weight (using shared memory friendly index) + atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + // gradient contribution to input + float dval = 0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + // Accumulate into a FP32 scratchpad + // BF16 atomics are potentially much slower... and this is more precise! 
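    // How the cross-block reduction below works: every threadblock atomicAdd's its
    // partial dbias/dweight sums into the FP32 scratch buffer, then thread 0 bumps a
    // counter with atomicInc. The block whose atomicInc returns gridDim.x-1 is the last
    // one to arrive, so it alone converts the accumulated FP32 totals back to floatX and
    // writes the final dbias/dweight, reordering back to the global memory layout.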
+ // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though + __syncthreads(); + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); + for(int i = threadIdx.x; i < C; i+= blockDim.x) { + // global atomics in the same "shared memory banking friendly" order + atomicAdd(&scratch_dbias[i], dbias_shared[i]); + atomicAdd(&scratch_dweight[i], dweight_shared[i]); + } + __syncthreads(); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + for (int i = warpId; i < iterations_C; i += warpsInBlock) { + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + + x128 dbias128; + x128 dweight128; + for (int x = 0; x < x128::size; x++) { + dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; + dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + // ---------------------------------------------------------------------------- // kernel launchers @@ -828,6 +951,20 @@ void layernorm_backward7(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* s layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); } +template +void layernorm_backward8(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* scratch, + const Tdout* dout, const Trest* inp, const Tparams* weight, const Trest* mean, const Trest* rstd, + int B, int T, int C, int block_size) { + const int grid_size = (1024/block_size) * cuda_num_SMs; + size_t shared_mem_size = (2 * C + 1) * sizeof(float); + + // Including this as part of the timing until we can parallelise it + // It should fully hide the cost and improve kernel perf by >5% if done in parallel using CUDA streams + cudaMemset(scratch, 0, (1 + 2 * C) * sizeof(float)); + + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); +} + // kernel version dispatch void layernorm_backward(int kernel_num, floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -860,6 +997,9 @@ void layernorm_backward(int kernel_num, case 7: layernorm_backward7(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); break; + case 8: + layernorm_backward8(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); diff --git a/train_gpt2.cu b/train_gpt2.cu index 86ceb5caa..c64cf7199 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -108,6 +108,14 @@ class NvtxRange { }; #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) +// try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance +// this needs to be defines rather than queried to be used for __launch_bounds__ +#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 +#define MAX_1024_THREADS_BLOCKS 2 +#else +#define MAX_1024_THREADS_BLOCKS 1 +#endif + // cuBLAS workspace. 
Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK const size_t cublaslt_workspace_size = 32 * 1024 * 1024; void* cublaslt_workspace = NULL; @@ -271,6 +279,11 @@ template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 +template +__device__ void store128cg(ElementType* target, Packed128 value) { + __stcg(reinterpret_cast(target), value.get_bits()); +} // short-form typedefs typedef Packed128 f128; @@ -772,7 +785,7 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -__global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC) { // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX @@ -793,36 +806,48 @@ __global__ void matmul_backward_bias_kernel6(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { - shared[idx] = 0.0f; + for (int i = thread_id; i < OC_per_warp; i += block_size) { + shared[i] = 0.0f; } __syncthreads(); - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + i*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // so we accumulate in a conflict-free order, then reorder to match the global memory order for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + local_oc + k, accumulators[k]); + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); } + if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data __syncthreads(); - if (threadIdx.y == 0) { - for (int idx = threadIdx.x; idx < OC_per_warp; idx += block_size_x) { - atomicAdd(dbias + idx + blockIdx.x*OC_per_warp, shared[idx]); - } - } + // read the accumulated values in the conflict-free order + int i = threadIdx.x + (threadIdx.y * block_size_x); + float tmp = shared[i]; + __syncthreads(); + // write them back to shared memory in the global memory order + // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) + shared[local_oc + threadIdx.y] = tmp; + __syncthreads(); + // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); } -__global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, - const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, - int B, int T, int C) { +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const 
floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { extern __shared__ float shared[]; // size = 2 * C + 1 int warpId = threadIdx.x / warpSize; // warp index within a block int warpsInBlock = blockDim.x / warpSize; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = warpSize * x128::size; + int iterations_C = C / C_per_iteration; // the first half of shared memory is bias, second is weight float* dbias_shared = shared; @@ -849,56 +874,85 @@ __global__ void layernorm_backward_kernel7(floatX* dinp, floatX* dweight, floatX // first: two reduce operations float dnorm_mean = 0.0f; float dnorm_norm_mean = 0.0f; - for (int i = warpThreadIdx; i < C; i += warpSize) { - float norm_bti = ((float)inp_bt[i] - mean_bt) * rstd_bt; - float dnorm_i = (float)weight[i] * (float)dout_bt[i]; - dnorm_mean += dnorm_i; - dnorm_norm_mean += dnorm_i * norm_bti; + for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = ((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } } dnorm_mean = warpReduceSum(dnorm_mean) / C; dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; // now iterate again and accumulate all the gradients - // todo - use x128 for this loop to improve performance - for (int i = warpThreadIdx; i < C; i += warpSize) { - float dout_i = (float)__ldcs(&dout_bt[i]); - float norm_bti = ((float)__ldcs(&inp_bt[i]) - mean_bt) * rstd_bt; - float dnorm_i = (float)weight[i] * dout_i; - // gradient contribution to bias - atomicAdd(&dbias_shared[i], dout_i); - // gradient contribution to weight - atomicAdd(&dweight_shared[i], norm_bti * dout_i); - // gradient contribution to input - float dval = 0.0f; - dval += dnorm_i; // term 1 - dval -= dnorm_mean; // term 2 - dval -= norm_bti * dnorm_norm_mean; // term 3 - dval *= rstd_bt; // final scale - dinp_bt[i] = (floatX)((float)dinp_bt[i] + dval); + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + // gradient contribution to bias (using shared memory friendly index) + atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + // gradient contribution to weight (using shared memory friendly index) + atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + // gradient contribution to input + float dval = 
0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); } } - // Accumulate into a FP32 scratchpad // BF16 atomics are potentially much slower... and this is more precise! - // todo - could avoid the extra copy if floatX is FP32, fairly negligible though + // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); for(int i = threadIdx.x; i < C; i+= blockDim.x) { + // global atomics in the same "shared memory banking friendly" order atomicAdd(&scratch_dbias[i], dbias_shared[i]); atomicAdd(&scratch_dweight[i], dweight_shared[i]); } __syncthreads(); if (threadIdx.x == 0) { - *tmp_flag = atomicAdd(scratchFlag, 1); + *tmp_flag = atomicInc(scratchFlag, gridDim.x); } __syncthreads(); if (*tmp_flag == gridDim.x-1) { - for(int i = threadIdx.x; i < C; i+= blockDim.x) { - // todo - potentially do stochastic rounding here as well - dbias[i] = (floatX)scratch_dbias[i]; - dweight[i] = (floatX)scratch_dweight[i]; + for (int i = warpId; i < iterations_C; i += warpsInBlock) { + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + + x128 dbias128; + x128 dweight128; + for (int x = 0; x < x128::size; x++) { + dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; + dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); } } } @@ -982,21 +1036,35 @@ struct SoftmaxParams { float Offset; }; -__device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, int V, int P) { +__device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, int V, int P) { // same but not float4 // one row of inp, i.e. 
inp[idx, :] of shape (V,) const floatX* x = inp + idx * P; float thread_maxval = -INFINITY; float thread_sumval = 0.0f; - // do the loop in reverse to maximise probability of L2 cache hits - // so even small L2s get some hits on the 2nd read of the same thread - for (int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; i >= 0; i -= blockDim.x) { - x128 packed_x = load128(x + i * x128::size); // try to keep in cache until next read - for(int k = 0; k < packed_x.size; ++k) { - if (i*x128::size+k >= V) { // bounds checking against real V - continue; + int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; + + // special-case loop to handle the unaligned elements at the end of the array + // this lets us skip the bounds check in the main loop below, which improves performance + while ((i+1)*x128::size > V) { + for(int k = 0; k < x128::size; ++k) { + if (i*x128::size+k >= V) { + break; // bounds checking against real V (rather than padded P) } + float v = (float)x[i*x128::size+k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + i -= blockDim.x; + } + + // main loop for the bulk of the iterations (no bounds checking required!) + for (; i >= 0; i -= blockDim.x) { + x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop + for(int k = 0; k < x128::size; ++k) { float v = (float)packed_x[k]; float old_maxval = thread_maxval; thread_maxval = fmaxf(thread_maxval, v); @@ -1006,7 +1074,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, i } // Block Max Reduction -> Maths -> Block Sum Reduction - float block_maxval = blockReduce(thread_maxval); + float block_maxval = blockReduce(thread_maxval, false, -INFINITY); thread_sumval *= expf(thread_maxval - block_maxval); float block_sumval = blockReduce(thread_sumval); @@ -1014,16 +1082,19 @@ __device__ SoftmaxParams prepare_softmax_blockwide(int idx, const floatX* inp, i return SoftmaxParams{1.f / block_sumval, block_maxval}; } -// same as 2 but not using float4 (see dev/cuda/classifier_fused.cu) // will _update_ logits to logit gradients -__global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* probs, +// uses template to decide whether to write logits and probs +// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts +template +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs, const floatX* dlosses, const int* targets, int B, int T, int V, int P) { int idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data int ix = targets[idx]; // softmax (reading B * T * V, same logits read again below, hopefully still in cache) - SoftmaxParams sp = prepare_softmax_blockwide(idx, logits, V, P); + SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); // calculate the probability needed for the loss and update (single-threaded) if(threadIdx.x == 0) { @@ -1036,28 +1107,41 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX* // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < (V+x128::size-1)/x128::size; i += blockDim.x) { + int i = threadIdx.x; + for (; i < V/x128::size; i 
+= blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 - // this data will never be needed again, so we reduce cache persistence - x128 packed_logits_vec = load128cs(logits_vec + i * x128::size); // load and do not keep in cache + // it will be overwritten by the logits gradients which is when we reduce cache persistence + x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs x128 packed_probs; - x128 packed_logits; - for(int k = 0; k < packed_logits_vec.size; ++k) { - int element = i*packed_logits_vec.size + k; - if (element >= V) { // bounds checking against real V - continue; - } - float v = (float)packed_logits_vec[k]; - float prob = expf(v - sp.Offset) * sp.Scale; + for(int k = 0; k < x128::size; ++k) { + int element = i*x128::size + k; + float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; packed_probs[k] = (floatX)prob; float indicator = (element == ix) ? 1.0f : 0.0f; - packed_logits[k] = (floatX)((prob - indicator) * dloss); + packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); } - if (logits != NULL){ - store128(logits + idx * P + i * packed_logits_vec.size, packed_logits); + if (WriteLogits){ + // reduce cache persistence for the overwritten logits + // to maximise probability that logits remain in cache between prepare_softmax and here + store128cs(logits + idx * P + i * x128::size, packed_logits_vec); } - if (probs != NULL) { - store128(probs + idx * P + i * packed_logits_vec.size, packed_probs); + if (WriteProbs) { + store128(probs + idx * P + i * x128::size, packed_probs); + } + } + + // handle remaining elements after the last multiple of x128::size + // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements + i *= x128::size; + for (; i < V; i++) { + float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; + float indicator = (i == ix) ? 1.0f : 0.0f; + float dlogit = (prob - indicator) * dloss; + if (WriteLogits){ + __stcs(logits + idx * P + i, (floatX)dlogit); + } + if (WriteProbs) { + probs[idx * P + i] = (floatX)prob; } } } @@ -1286,9 +1370,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, / (block_size * grid_size_x)); // full GPU! 
assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance + assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel6<<>>(dbias_buffer, dout, B, T, OC); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); @@ -1310,14 +1395,15 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 1024; - const int grid_size = deviceProp.multiProcessorCount; + const int grid_size = MAX_1024_THREADS_BLOCKS * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); - layernorm_backward_kernel7<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } + // the sequence of transformations in this compound op is: // inp (B,T,3C) -> qkvr (B,T,3C) -> preatt (B,NH,T,T) -> att (B,NH,T,T) -> vaccum (B,T,C) -> out (B,T,C) void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* datt, floatX* scratch, @@ -1370,14 +1456,14 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // replaces logits with logit gradients template -void fused_classifier3(Type* logits, Type* losses, +void fused_classifier(Type* logits, Type* losses, const Type* dlosses, const int* targets, int B, int T, int V, int P) { NVTX_RANGE_FN(); const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel3<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1840,7 +1926,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass // we're passing dlosses = NULL, which will default them to 1.0f/(B*T), i.e. 
uniform loss - fused_classifier3(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); + fused_classifier(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event From 1ea7f9bf2595cec705f8d28d685e669a46d58f8e Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:54:50 +0100 Subject: [PATCH 031/172] tiny irrelevant optimisation to final unaligned fused_classifier loop + add missing common.h changes --- dev/cuda/classifier_fused.cu | 7 +++---- dev/cuda/common.h | 16 +++++++++++++--- train_gpt2.cu | 15 +++++++-------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 522ac5135..55f0d44cb 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -595,8 +595,7 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - int i = threadIdx.x; - for (; i < V/x128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // it will be overwritten by the logits gradients which is when we reduce cache persistence x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs @@ -620,8 +619,8 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // handle remaining elements after the last multiple of x128::size // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements - i *= x128::size; - for (; i < V; i++) { + int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size + for (int i = threadIdx.x + unaligned_start; i < V; i++) { float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; float indicator = (i == ix) ? 
1.0f : 0.0f; float dlogit = (prob - indicator) * dloss; diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 77e012fcd..63d0e1de1 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -48,6 +48,14 @@ int cuda_arch_minor = 0; int cuda_num_SMs = 0; // for persistent threads where we want 1 threadblock per SM int cuda_threads_per_SM = 0; // needed to calculate how many blocks to launch to fill up the GPU +// ---------------------------------------------------------------------------- +// to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance +#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 +#define MAX_1024_THREADS_BLOCKS 2 +#else +#define MAX_1024_THREADS_BLOCKS 1 +#endif + // ---------------------------------------------------------------------------- // Packed128 data structure, which forces the compiler to use 128-bit loads/stores // in GPUs that support (the LDG.128 and STS.128 instructions) @@ -88,24 +96,26 @@ template __device__ Packed128 load128(const ElementType* address) { return Packed128{*reinterpret_cast(address)}; } - // load a Packed128 from an aligned memory address with streaming cache hint template __device__ Packed128 load128cs(const ElementType* address) { return Packed128{__ldcs(reinterpret_cast(address))}; } - // store a Packed128 to an aligned memory address template __device__ void store128(ElementType* target, Packed128 value) { *reinterpret_cast(target) = value.get_bits(); } - // store a Packed128 to an aligned memory address with streaming cache hint template __device__ void store128cs(ElementType* target, Packed128 value) { __stcs(reinterpret_cast(target), value.get_bits()); } +// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 +template +__device__ void store128cg(ElementType* target, Packed128 value) { + __stcg(reinterpret_cast(target), value.get_bits()); +} // ---------------------------------------------------------------------------- // random utils diff --git a/train_gpt2.cu b/train_gpt2.cu index c64cf7199..3a339d56d 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -806,12 +806,12 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i accumulators[k] = 0.0f; } int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int i = thread_id; i < OC_per_warp; i += block_size) { - shared[i] = 0.0f; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; } __syncthreads(); - for (int i = blockIdx.y*block_size_y + threadIdx.y; i < B * T; i += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + i*OC); + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } @@ -1107,8 +1107,7 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; - int i = threadIdx.x; - for (; i < V/x128::size; i += blockDim.x) { + for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { // this is the 2nd read of logits after the one in prepare_softmax2 // it will be overwritten by the logits gradients which is when we reduce cache persistence x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs @@ -1132,8 +1131,8 @@ 
__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) // handle remaining elements after the last multiple of x128::size // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements - i *= x128::size; - for (; i < V; i++) { + int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size + for (int i = threadIdx.x + unaligned_start; i < V; i++) { float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; float indicator = (i == ix) ? 1.0f : 0.0f; float dlogit = (prob - indicator) * dloss; From e1f89b304378b1a5aa6ebfa4d829cefe4c3cfd72 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 00:57:01 +0100 Subject: [PATCH 032/172] remove BF16 default from classified_fused before PR --- dev/cuda/classifier_fused.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 55f0d44cb..c44727f73 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -23,10 +23,6 @@ nvcc -O3 --use_fast_math -lcublas -lcublasLt classifier_fused.cu -o classifier_f // todo - this file does not properly support anything but FP32 // kernel 5 can be run in fp16/bf16 to test performance, but the outputs will be wrong -#undef ENABLE_BF16 -#undef ENABLE_FP16 -#define ENABLE_BF16 - #if defined(ENABLE_BF16) typedef __nv_bfloat16 floatX; #elif defined(ENABLE_FP16) From 3cc16f135412b0269cc6b369078fd50bdd4ebc89 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 7 May 2024 00:39:25 +0000 Subject: [PATCH 033/172] fix logits bug --- train_gpt2.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index 1446e6165..7fceaff21 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -128,7 +128,7 @@ def __init__(self, config): self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying - def forward(self, idx, targets=None): + def forward(self, idx, targets=None, return_logits=True): device = idx.device b, t = idx.size() assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" @@ -147,11 +147,16 @@ def forward(self, idx, targets=None): # if we are given some desired targets also calculate the loss logits = self.lm_head(x) loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) - return None, loss else: # inference-time mini-optimization: only forward the lm_head on the very last position logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim - return logits, None + loss = None + + # there are performance reasons why not returning logits is prudent, if not needed + if not return_logits: + logits = None + + return logits, loss @classmethod def from_pretrained(cls, model_type): @@ -538,8 +543,7 @@ def get_batch(): for i in range(args.num_iterations): t0 = time.time() with ctx: - logits, loss = model(x, y) - del logits + _, loss = model(x, y, return_logits=False) if not args.inference_only: optimizer.zero_grad(set_to_none=True) loss.backward() From 25507542152175543b356e1bdc1e39dffb7c2c68 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Mon, 6 May 2024 17:41:19 -0700 Subject: [PATCH 034/172] Change FLT_MAX to flt_max --- train_gpt2.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 7a73cb612..d5014d8db 100644 --- a/train_gpt2.cu +++ 
b/train_gpt2.cu @@ -54,7 +54,6 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" -#undef FLT_MAX // ---------------------------------------------------------------------------- // CUDA precision settings @@ -683,8 +682,8 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons const floatX* x = inp + idx * T; // not INF, so we don't get NaNs accidentally when subtracting two values. - const float FLT_MAX = 340282346638528859811704183484516925440.0f; // to avoid including float.h - float maxval = -FLT_MAX; + const float flt_max = 340282346638528859811704183484516925440.0f; // to avoid including float.h + float maxval = -flt_max; float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); From c26124085296b9763b781a0a625f089d22398b0c Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 7 May 2024 03:43:27 +0100 Subject: [PATCH 035/172] 3 x 512 threads max for layernorm_backward to avoid cache thrashing (hacky -> better way?) --- train_gpt2.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 3a339d56d..8893e7660 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -835,7 +835,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); } -__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) +__global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, @@ -1393,8 +1393,11 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { NVTX_RANGE_FN(); - const int block_size = 1024; - const int grid_size = MAX_1024_THREADS_BLOCKS * deviceProp.multiProcessorCount; + // todo - forcing 3 x 512 threads per SM maximum is a bit hacky, but more than that results in + // cache thrashing and lower performance on A100... is there a better way? + const int block_size = 512; + const int blocks_per_sm = min(3, (deviceProp.maxThreadsPerMultiProcessor / 1024)); + const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); From b3e8a9fe6758302c76887de16870ca74c55b6401 Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 09:22:14 +0800 Subject: [PATCH 036/172] Implementation of online softmax forward kernel without cgs. 
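The idea, in short: instead of one pass over the row to find the max and a second pass to accumulate the exponentials, a single pass keeps a running maximum and a running sum, rescaling the sum whenever the maximum grows. A minimal CPU sketch of that recurrence (an illustration only, not code from this patch; the function name is made up):

    #include <math.h>

    // single-pass ("online") softmax over one row x[0..C-1]
    void softmax_online_reference(float* out, const float* x, int C) {
        float maxval = -INFINITY;   // running max of x[0..i]
        float sumval = 0.0f;        // running sum of expf(x[j] - maxval)
        for (int i = 0; i < C; i++) {
            if (x[i] > maxval) {
                sumval *= expf(maxval - x[i]);  // rescale old sum to the new max
                maxval = x[i];
            }
            sumval += expf(x[i] - maxval);      // add the new element
        }
        for (int i = 0; i < C; i++) {
            out[i] = expf(x[i] - maxval) / sumval;
        }
    }

The kernel below does the same merge across threads: each thread keeps a partial (maxval, sumval) pair, and the pairs are combined with warp shuffles and shared memory before the final normalization loop writes the row.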
--- dev/cuda/softmax_forward.cu | 92 +++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index f611864f0..891c0cb85 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -514,6 +514,88 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } } +__global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { + // do the same job as softmax_forward_online_kernel1() + // further combines unrolling, shared memory and warp reduce utilities in CUDA + extern __shared__ float shared[]; + const int UNROLL_FACTOR = 8; + // FAKE_MAXVAL is set empirically to mimic the maxval that would appear in real situation + const float FAKE_MAXVAL = 10.f; + const int warpsPerBlock = blockDim.x / warpSize; + int idx = blockIdx.x; + int tid = threadIdx.x; + int laneId = tid % warpSize; + int warpId = tid / warpSize; + float* maxvals = shared; + float* sumvals = &shared[warpsPerBlock]; + + if (tid >= C) { + maxvals[warpId] = -INFINITY; + sumvals[warpId] = 0.0f; + return; + } + + const float* x = inp + idx * C; + float* y = out + idx * C; + + // each thread computes partial maxval and sumval in range [::blockDim.x] + // after finished this part, each thread in block-0 holds partial maxval and sumval + float maxval = -INFINITY, sumval = 0.0f; + for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { + #pragma unroll + for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { + maxval = fmaxf(maxval, x[i + j * blockDim.x]); + // using FAKE_MAXVAL to avoid keeping updating inter-maxvals + sumval += expf(x[i + j * blockDim.x] - FAKE_MAXVAL); + } + } + sumval *= expf(FAKE_MAXVAL - maxval); + + // computes sumval and maxval of each warp (32 threads) + // after finished this part, shared memory holds maxval and sumval of each warp + + float offset_maxval, offset_sumval, tmp_maxval; + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + offset_maxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); + offset_sumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); + tmp_maxval = fmaxf(maxval, offset_maxval); + sumval = sumval * expf(maxval - tmp_maxval) + + offset_sumval * expf(offset_maxval - tmp_maxval); + maxval = tmp_maxval; + } + if (laneId == 0) { + sumvals[warpId] = sumval; + maxvals[warpId] = maxval; + } + __syncthreads(); + + // computes the global maxval and sumval of row `idx` + if (tid < warpsPerBlock / 2) { + #pragma unroll + for (int offset = warpsPerBlock / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + tmp_maxval = fmaxf(maxvals[tid], maxvals[tid + offset]); + sumvals[tid] = sumvals[tid] * expf(maxvals[tid] - tmp_maxval) + + sumvals[tid + offset] * + expf(maxvals[tid + offset] - tmp_maxval); + maxvals[tid] = tmp_maxval; + } + } + } + __syncthreads(); + + // write the final results into `out` + maxval = maxvals[0]; + float sum = sumvals[0]; + for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { + #pragma unroll + for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { + // __stcs(&y[i + j * blockDim.x], expf(x[i + j * blockDim.x] - maxval) / sum); + y[i + j * blockDim.x] = expf(x[i + j * blockDim.x] - maxval) / sum; + } + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -560,6 +642,13 @@ void softmax_forward7(float* out, const float* inp, int N, int C, int block_size softmax_forward_kernel7<<>>(out, inp, N, C); } +void 
softmax_forward_online8(float* out, const float* inp, int N, int C, int block_size) { + const int grid_size = N; + size_t shared_mem_size = 2 * block_size / 32 * sizeof(float); + softmax_forward_online_kernel8<<>>(out, inp, N, C); + cudaCheck(cudaGetLastError()); +} + // kernel version dispatch void softmax_forward(int kernel_num, float* out, const float* inp, int N, int C, const int block_size) { switch (kernel_num) { @@ -584,6 +673,9 @@ void softmax_forward(int kernel_num, float* out, const float* inp, int N, int C, case 7: softmax_forward7(out, inp, N, C, block_size); break; + case 8: + softmax_forward_online8(out, inp, N, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From dc901d420bdabb849cb573f60d355abd6eb348f7 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Wed, 8 May 2024 04:50:24 +0000 Subject: [PATCH 037/172] set correct gpu using multigpu config --- train_gpt2.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 16ff756ce..15d915835 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -428,6 +428,7 @@ MultiGpuConfig multi_gpu_config_init(int *argc, char ***argv) { return result; #else printf("Multi-GPU support is disabled. Using a single GPU.\n"); + cudaCheck(cudaSetDevice(0)); MultiGpuConfig result; result.process_rank = 0; result.num_processes = 1; @@ -2181,12 +2182,10 @@ void gpt2_free(GPT2 *model) { // ---------------------------------------------------------------------------- // common init & free code for train/test/profile void common_start(bool override_enable_tf32 = true, bool print_device_info = true) { - int deviceIdx = 0; - cudaCheck(cudaSetDevice(deviceIdx)); - cudaGetDeviceProperties(&deviceProp, deviceIdx); + cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx); if (print_device_info) { printf("[System]\n"); - printf("Device %d: %s\n", deviceIdx, deviceProp.name); + printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name); } cudaCheck(cudaStreamCreate(&main_stream)); From 2356be7333323dcd170ff4d8d4ad94d5c50b87d8 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Wed, 8 May 2024 05:02:23 +0000 Subject: [PATCH 038/172] set stream to main_stream in ncclAllReduce --- train_gpt2.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 15d915835..95938d7e0 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2128,8 +2128,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { model->num_parameters, ncclFloatX, ncclAvg, multi_gpu_config->nccl_comm, - // use 0 for default stream (always implicitly synchronised) - /*stream=*/0)); + main_stream)); #endif } From 6a52d8619e6b3ee200d1a07d0c837c919f263486 Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 16:12:39 +0800 Subject: [PATCH 039/172] Optimize codes and comments --- dev/cuda/softmax_forward.cu | 107 ++++++++++++++---------------------- 1 file changed, 40 insertions(+), 67 deletions(-) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index 891c0cb85..fb084dad8 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -515,84 +515,58 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { - // do the same job as softmax_forward_online_kernel1() - // further combines unrolling, shared memory and warp reduce utilities in CUDA - extern __shared__ float 
shared[]; - const int UNROLL_FACTOR = 8; - // FAKE_MAXVAL is set empirically to mimic the maxval that would appear in real situation - const float FAKE_MAXVAL = 10.f; const int warpsPerBlock = blockDim.x / warpSize; - int idx = blockIdx.x; int tid = threadIdx.x; - int laneId = tid % warpSize; - int warpId = tid / warpSize; - float* maxvals = shared; - float* sumvals = &shared[warpsPerBlock]; if (tid >= C) { - maxvals[warpId] = -INFINITY; - sumvals[warpId] = 0.0f; return; } - - const float* x = inp + idx * C; - float* y = out + idx * C; - // each thread computes partial maxval and sumval in range [::blockDim.x] - // after finished this part, each thread in block-0 holds partial maxval and sumval - float maxval = -INFINITY, sumval = 0.0f; - for (int i = tid; i < C; i += blockDim.x * UNROLL_FACTOR) { - #pragma unroll - for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { - maxval = fmaxf(maxval, x[i + j * blockDim.x]); - // using FAKE_MAXVAL to avoid keeping updating inter-maxvals - sumval += expf(x[i + j * blockDim.x] - FAKE_MAXVAL); - } - } - sumval *= expf(FAKE_MAXVAL - maxval); + int warpId = tid / warpSize; + int laneId = tid % warpSize; + int row = blockIdx.x * warpsPerBlock + warpId; - // computes sumval and maxval of each warp (32 threads) - // after finished this part, shared memory holds maxval and sumval of each warp - - float offset_maxval, offset_sumval, tmp_maxval; - for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - offset_maxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); - offset_sumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); - tmp_maxval = fmaxf(maxval, offset_maxval); - sumval = sumval * expf(maxval - tmp_maxval) + - offset_sumval * expf(offset_maxval - tmp_maxval); - maxval = tmp_maxval; + if (row >= N) { + return; } - if (laneId == 0) { - sumvals[warpId] = sumval; - maxvals[warpId] = maxval; + + const float* x = inp + row * C; + float* const y = out + row * C; + + // merging calculating maxval and sumval in one loop + // which is an arithmetic improvment from online softmax over normal softmax + float maxval = -INFINITY, sumval = 0.0f, bigger; + for (int i = laneId; i < C; i += warpSize) { + // when updating the maxval, dynamically updates the previous sumval by + // multiplying e^{previous_maxval - current_maxval} + bigger = fmaxf(maxval, x[i]); + sumval = sumval * expf(maxval - bigger) + expf(x[i] - bigger); + maxval = bigger; } - __syncthreads(); - // computes the global maxval and sumval of row `idx` - if (tid < warpsPerBlock / 2) { - #pragma unroll - for (int offset = warpsPerBlock / 2; offset > 0; offset >>= 1) { - if (tid < offset) { - tmp_maxval = fmaxf(maxvals[tid], maxvals[tid + offset]); - sumvals[tid] = sumvals[tid] * expf(maxvals[tid] - tmp_maxval) + - sumvals[tid + offset] * - expf(maxvals[tid + offset] - tmp_maxval); - maxvals[tid] = tmp_maxval; - } + // using warp functions instead of cooperative groups for better readibility + // calculate the warp wised maxval and sumval + float offsetMaxval, offsetSumval; + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + __syncwarp(); + offsetMaxval = __shfl_down_sync(0xFFFFFFFF, maxval, offset); + offsetSumval = __shfl_down_sync(0xFFFFFFFF, sumval, offset); + if (offsetMaxval > maxval) { + sumval *= expf(maxval - offsetMaxval); + maxval = offsetMaxval; + } else { + offsetSumval *= expf(offsetMaxval - maxval); } + sumval += offsetSumval; } - __syncthreads(); - // write the final results into `out` - maxval = maxvals[0]; - float sum = sumvals[0]; - for (int i = tid; i < 
C; i += blockDim.x * UNROLL_FACTOR) { - #pragma unroll - for (int j = 0; j < UNROLL_FACTOR && i + j * blockDim.x < C; ++j) { - // __stcs(&y[i + j * blockDim.x], expf(x[i + j * blockDim.x] - maxval) / sum); - y[i + j * blockDim.x] = expf(x[i + j * blockDim.x] - maxval) / sum; - } + // retrive the warp wised maxval and sumval + // which are also the maxval and sumval of one row in C + maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); + sumval = __shfl_sync(0xFFFFFFFF, sumval, 0); + + for (int i = laneId; i < C; i += warpSize) { + y[i] = expf(x[i] - maxval) / sumval; } } @@ -643,9 +617,8 @@ void softmax_forward7(float* out, const float* inp, int N, int C, int block_size } void softmax_forward_online8(float* out, const float* inp, int N, int C, int block_size) { - const int grid_size = N; - size_t shared_mem_size = 2 * block_size / 32 * sizeof(float); - softmax_forward_online_kernel8<<>>(out, inp, N, C); + const int grid_size = ceil_div(N * 32, block_size); + softmax_forward_online_kernel8<<>>(out, inp, N, C); cudaCheck(cudaGetLastError()); } From 6d7a99cb4e70464228b13b6d80996484fe1c38ad Mon Sep 17 00:00:00 2001 From: KarhouTam Date: Wed, 8 May 2024 17:45:14 +0800 Subject: [PATCH 040/172] Adjust comments --- dev/cuda/softmax_forward.cu | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index fb084dad8..b487b956d 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -515,6 +515,9 @@ __global__ void softmax_forward_kernel7(float* out, const float* inp, int N, int } __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int N, int C) { + // online softmax paper: http://arxiv.org/abs/1805.02867 + // online softmax reduces loops from 3 to 2 + // which is done by calculating sumval and maxval in one loop const int warpsPerBlock = blockDim.x / warpSize; int tid = threadIdx.x; @@ -524,6 +527,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int int warpId = tid / warpSize; int laneId = tid % warpSize; + // one warp one row int row = blockIdx.x * warpsPerBlock + warpId; if (row >= N) { @@ -533,7 +537,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int const float* x = inp + row * C; float* const y = out + row * C; - // merging calculating maxval and sumval in one loop + // merge calculating maxval and sumval in one loop // which is an arithmetic improvment from online softmax over normal softmax float maxval = -INFINITY, sumval = 0.0f, bigger; for (int i = laneId; i < C; i += warpSize) { @@ -544,7 +548,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int maxval = bigger; } - // using warp functions instead of cooperative groups for better readibility + // use warp functions instead of cooperative groups for better readibility // calculate the warp wised maxval and sumval float offsetMaxval, offsetSumval; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { @@ -560,7 +564,7 @@ __global__ void softmax_forward_online_kernel8(float* out, const float* inp, int sumval += offsetSumval; } - // retrive the warp wised maxval and sumval + // sync the warp wised maxval and sumval // which are also the maxval and sumval of one row in C maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); sumval = __shfl_sync(0xFFFFFFFF, sumval, 0); From 99e5c5d2835ef25286f2ccf05914970e603de776 Mon Sep 17 00:00:00 2001 From: Paul Maragakis Date: Wed, 8 May 2024 15:17:25 -0400 Subject: [PATCH 041/172] Allow the 
code to train gpt2-xl This commit is the fix by adameure described in the comment below: https://github.com/karpathy/llm.c/pull/382#issuecomment-2100895501 To reproduce the following bug: https://github.com/karpathy/llm.c/pull/382#issuecomment-2100648148 First switch to gpt2-xl for tinystories with the following three commands: python prepro_tinystories.py # download tinystories sed -i 's/from_pretrained("gpt2")/from_pretrained("gpt2-xl")/' train_gpt2.py # select gpt2-xl python train_gpt2.py --input_bin data/TinyStories_train.bin --batch_size 1 # Prep the large model binaries Now you can compile and try the code with or without the fix. On an H100 machine I use the following command: train_gpt2cu -i data/TinyStories -b 10 -x 100 # gets 230k/s on one H100 node of 8 GPU --- train_gpt2.cu | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 95938d7e0..0150ceba8 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -812,16 +812,18 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i shared[idx] = 0.0f; } __syncthreads(); - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { - x128 packed_dout = load128(dout + global_oc + idx*OC); - for (int k = 0; k < x128::size; k++) { - accumulators[k] += (float)packed_dout[k]; - } - } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance - // so we accumulate in a conflict-free order, then reorder to match the global memory order - for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + if(global_oc < OC) { + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // so we accumulate in a conflict-free order, then reorder to match the global memory order + for (int k = 0; k < x128::size; k++) { + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + } } if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data __syncthreads(); @@ -834,7 +836,9 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i shared[local_oc + threadIdx.y] = tmp; __syncthreads(); // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + if (i + blockIdx.x*OC_per_warp < OC) { + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + } } __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? @@ -1366,11 +1370,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, const int OC_per_warp = warp_size * x128::size; // 256 at BF16 const int block_size_x = 32; const int block_size_y = block_size / block_size_x; // 16 - const int grid_size_x = OC / OC_per_warp; // e.g. 3 horizontal blocks for 768 OCs at BF16 + const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! 
- assert((OC % OC_per_warp) == 0); // there is no bounds checking in the kernel to maximise performance assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); From 0ce5fcf7e8be6d8865cfdc0884ebe05f9a31cfdd Mon Sep 17 00:00:00 2001 From: Anerudhan Date: Wed, 8 May 2024 19:53:48 +0000 Subject: [PATCH 042/172] Rename cudnn_att.cu to cudnn_att.cpp to speed up compilation. --- Makefile | 4 ++-- cudnn_att.cu => cudnn_att.cpp | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename cudnn_att.cu => cudnn_att.cpp (100%) diff --git a/Makefile b/Makefile index c4879588b..60e60b6a7 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ train_gpt2: train_gpt2.c test_gpt2: test_gpt2.c $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -$(NVCC_CUDNN): cudnn_att.cu +$(NVCC_CUDNN): cudnn_att.cpp $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) @@ -256,4 +256,4 @@ profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: - $(REMOVE_FILES) $(TARGETS) + $(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN) diff --git a/cudnn_att.cu b/cudnn_att.cpp similarity index 100% rename from cudnn_att.cu rename to cudnn_att.cpp From 84a2aa0dd08039493a7226c505bad919e594b870 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 8 May 2024 20:13:14 +0000 Subject: [PATCH 043/172] allow exporting all GPT-2 model sizes now, using --model flag in python script. also allow loading all the models from C, using the new flag -e, to point directly to the file to load. added some error handling for potentially common mistakes. 
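for example, exporting and then training the 355M model would look something like this (commands are illustrative):

    python train_gpt2.py --model gpt2-medium    # writes gpt2_355M.bin and gpt2_355M_bf16.bin
    ./train_gpt2cu -e gpt2_355M_bf16.bin        # point the CUDA trainer at that checkpoint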
note that loadin GPT-2 XL does not work yet and crashes, but I am imminently merging a fix as the next commit that resolves this --- train_gpt2.cu | 17 ++++++++++++++--- train_gpt2.py | 12 ++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 95938d7e0..4374682ec 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -69,7 +69,6 @@ enum PrecisionMode { typedef float floatX; #define CUBLAS_LOWP CUDA_R_32F #define PRECISION_MODE PRECISION_FP32 -const char* load_filename = "gpt2_124M.bin"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclFloat; #endif @@ -79,7 +78,6 @@ const ncclDataType_t ncclFloatX = ncclFloat; typedef half floatX; #define CUBLAS_LOWP CUDA_R_16F #define PRECISION_MODE PRECISION_FP16 -const char* load_filename = "gpt2_124M.bin"; #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclHalf; #endif @@ -88,7 +86,6 @@ const ncclDataType_t ncclFloatX = ncclHalf; typedef __nv_bfloat16 floatX; #define CUBLAS_LOWP CUDA_R_16BF #define PRECISION_MODE PRECISION_BF16 -const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights specific filename #ifdef MULTI_GPU const ncclDataType_t ncclFloatX = ncclBfloat16; #endif @@ -1744,6 +1741,17 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n"); exit(EXIT_FAILURE); } + if (PRECISION_MODE == PRECISION_BF16 && version != 5) { + fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n"); + exit(EXIT_FAILURE); + } + if (PRECISION_MODE == PRECISION_FP32 && version != 3) { + fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n"); + fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n"); + exit(EXIT_FAILURE); + } // read in hyperparameters model->config.max_seq_len = model_header[2]; @@ -2370,6 +2378,7 @@ void error_usage() { fprintf(stderr, "Example: ./train_gpt2cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); @@ -2392,6 +2401,7 @@ int main(int argc, char *argv[]) { // read in the (optional) command line arguments const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. 
data/TinyStories + const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max @@ -2410,6 +2420,7 @@ int main(int argc, char *argv[]) { if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU batch size else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } diff --git a/train_gpt2.py b/train_gpt2.py index 7fceaff21..80547b8f1 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -395,6 +395,7 @@ def print0(*args, **kwargs): # python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 parser = argparse.ArgumentParser() parser.add_argument("--input_bin", type=str, default="data/tiny_shakespeare_val.bin", help="input .bin to train on") + parser.add_argument("--model", type=str, default="gpt2", help="gpt2|gpt2-medium|gpt2-large|gpt2-xl") parser.add_argument("--write_tensors", type=int, default=1, help="write tensors to disk") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") parser.add_argument("--dtype", type=str, default="float32", help="float32|float16|bfloat16") @@ -409,6 +410,8 @@ def print0(*args, **kwargs): B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 assert args.dtype in {"float32", "float16", "bfloat16"} + assert args.model in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"} + model_to_size = {"gpt2": "124M", "gpt2-medium": "355M", "gpt2-large": "774M", "gpt2-xl": "1558M"} # set up DDP (distributed data parallel). torchrun sets this env variable ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run? @@ -469,7 +472,7 @@ def print0(*args, **kwargs): write_tokenizer(enc, "gpt2_tokenizer.bin") # load the GPT-2 model weights - model = GPT.from_pretrained("gpt2") + model = GPT.from_pretrained(args.model) model.train() model.to(device) if args.compile: @@ -519,11 +522,12 @@ def get_batch(): logits, loss = model(x, y) loss.backward() # save model params, in both float32 and bfloat16 - write_model(model, "gpt2_124M.bin", dtype="float32") - write_model(model, "gpt2_124M_bf16.bin", dtype="bfloat16") + model_size_str = model_to_size[args.model] # e.g. "124M" + write_model(model, f"gpt2_{model_size_str}.bin", dtype="float32") + write_model(model, f"gpt2_{model_size_str}_bf16.bin", dtype="bfloat16") # save x, y, logits, loss, and parameter gradients, for debugging C # always store these in fp32 to have an accurate reference (?) 
- write_state(model, x, y, logits, loss, "gpt2_124M_debug_state.bin") + write_state(model, x, y, logits, loss, f"gpt2_{model_size_str}_debug_state.bin") # ------------------------------------------------------------------------- # STAGE 2: training loop to get timings From 26dbbc75c29b618d0038c92776424124e57d0598 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 8 May 2024 20:18:33 +0000 Subject: [PATCH 044/172] unbreak the tests oops --- test_gpt2.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_gpt2.cu b/test_gpt2.cu index d7944125c..7613b6ba3 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -85,8 +85,16 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size int main(int argc, char *argv[]) { common_start(false, true); + // set the right paths + #if defined(ENABLE_BF16) + const char* load_filename = "gpt2_124M_bf16.bin"; + #else + const char* load_filename = "gpt2_124M.bin"; + #endif + // build the GPT-2 model from a checkpoint GPT2 model; + gpt2_build_from_checkpoint(&model, load_filename); size_t V = model.config.vocab_size; size_t Vp = model.config.padded_vocab_size; From 69aa64cc80410b590437eb5b41cb348b7da45254 Mon Sep 17 00:00:00 2001 From: Joe Halabi Date: Tue, 7 May 2024 17:18:03 -0700 Subject: [PATCH 045/172] Adds cuDNN install instructions for a smaller but sufficient package - Modifies README.md to provide example apt-get cuDNN install instructions that install the cuDNN dev package. "sudo apt-get install -y cudnn" will install the default cuDNN packages, but for a minimal setup, installing the dev package will see a 50% reduction in both, download size (~850MB to 425MB now) and local storage size (~2GB to ~1GB now). - Modifies the Makefile to point users to the README for cuDNN install instructions (through comments and the cuDNN install error message) - Modifies attention_forward.cu comments to point users to the README for cuDNN install instructions Signed-off-by: Vedaanta Agarwalla --- Makefile | 10 +++------- README.md | 13 +++++++++++-- dev/cuda/attention_forward.cu | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 60e60b6a7..eabb5646d 100644 --- a/Makefile +++ b/Makefile @@ -87,11 +87,7 @@ endif # Check and include cudnn if available # You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line -# You need cuDNN from: https://developer.nvidia.com/cudnn -# Follow the apt-get instructions or Windows instructions to install the cuDNN library -# And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main -# For this there is no installation, just download the repo to your home directory or directory of your choice -# and then we include it below (see currently hard-coded path assumed in home directory) +# Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) # hard-coded path for now in either . or ($HOME) directory @@ -103,7 +99,7 @@ ifeq ($(USE_CUDNN), 1) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else - $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) + $(error ✗ cuDNN not found. 
See the README for install instructions and the Makefile for hard-coded paths) endif NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) NVCC_LDFLAGS += -lcudnn @@ -119,7 +115,7 @@ ifeq ($(USE_CUDNN), 1) else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location else - $(error ✗ cuDNN not found. See the Makefile for our currently hard-coded paths / install instructions) + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) endif CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) diff --git a/README.md b/README.md index dbb99e030..29d53689e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see `Makefile` for instructions), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can also go faster with flash attention ```bash make train_gpt2cu USE_CUDNN=1 @@ -256,7 +256,16 @@ If you have the latest CUDA you should expect this to compile OK, and you should make train_gpt2cu USE_CUDNN=1 ``` -This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. Follow the [cuDNN installation instructions](https://developer.nvidia.com/cudnn) to install cuDNN with apt-get. On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). +This will try to compile with cudnn and run it. You have to have cuDNN installed on your system. The [cuDNN installation instructions](https://developer.nvidia.com/cudnn) with apt-get will grab the default set of cuDNN packages. For a minimal setup, the cuDNN dev package is sufficient, e.g. on Ubuntu 22.04 for CUDA 12.x: + +```bash +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install libcudnn9-dev-cuda-12 +``` + +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. the Makefile looks for `~/cudnn-frontend/include`). **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a9325f085..8b6aaf61d 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -2,8 +2,8 @@ Kernels for attention forward pass. 
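Once the dev package above is installed and the header-only frontend is fetched, a cuDNN-enabled build is just the usual target with `USE_CUDNN=1`. A minimal sketch, assuming the frontend is cloned into the home directory where the Makefile currently looks for it:

```bash
# the Makefile's hard-coded search path is ~/cudnn-frontend/include
git clone https://github.com/NVIDIA/cudnn-frontend.git ~/cudnn-frontend
make train_gpt2cu USE_CUDNN=1
./train_gpt2cu
```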
If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels -You need cuDNN from: https://developer.nvidia.com/cudnn -And the cuDNN front-end from: https://github.com/NVIDIA/cudnn-frontend/tree/main + +See the README for cuDNN install instructions Compile example with cuDNN: nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward From 99e765de8d7e743994268d32e64e192be5e3874f Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Thu, 9 May 2024 11:19:24 +0200 Subject: [PATCH 046/172] moved bf16 boilerplate to common.h --- dev/cuda/attention_forward.cu | 9 ++------ dev/cuda/common.h | 41 ++++++++++++++++++++++++++++++++++ dev/cuda/encoder_forward.cu | 16 +------------ dev/cuda/gelu_backward.cu | 16 +------------ dev/cuda/gelu_forward.cu | 16 +------------ dev/cuda/layernorm_backward.cu | 15 +------------ dev/cuda/residual_forward.cu | 15 +------------ 7 files changed, 48 insertions(+), 80 deletions(-) diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a9325f085..66cfa629b 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -53,14 +53,9 @@ version 11 is kernel 10 skipping FP16/FP32 conversions (full FP16/BF16 network) #include #include #include -#include "common.h" -// ---------------------------------------------------------------------------- -// Floating point precision setup -typedef __nv_bfloat16 floatX; // half or __nv_bfloat16 (or float) -#define CUBLAS_LOWP CUDA_R_16BF // CUDA_R_16F or CUDA_R_16BF (or CUDA_R_32F) -// CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_16F (for CUDA_R_16F only, potentially slower?!) -#define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F +#define ENABLE_BF16 +#include "common.h" // ---------------------------------------------------------------------------- // CUDA & cuDNN setup diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 63d0e1de1..788a8f505 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -117,6 +117,47 @@ __device__ void store128cg(ElementType* target, Packed128 value) { __stcg(reinterpret_cast(target), value.get_bits()); } +// ---------------------------------------------------------------------------- +// reduced/mixed precision utilities + +#if defined(ENABLE_BF16) + +typedef __nv_bfloat16 floatX; +typedef __nv_bfloat16 floatN; +#define CUBLAS_LOWP CUDA_R_16BF // CUDA_R_16F or CUDA_R_16BF (or CUDA_R_32F) +// CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_16F (for CUDA_R_16F only, potentially slower?!) +#define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F + +#elif defined(ENABLE_FP16) + +typedef half floatX; +typedef half floatN; + +#else + +typedef float floatX; +typedef float floatN; +#endif + +typedef Packed128 x128; + + +// older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being unsigned shorts. +// we need to be careful here to only define our own versions if none already exist, otherwise the compiler will +// complain. 
+// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) +#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) +__device__ floatX __ldcs(const floatX* address) { + unsigned short bf = __ldcs(reinterpret_cast(address)); + return __nv_bfloat16_raw{bf}; +} + +__device__ void __stcs(floatX* address, floatX value) { + __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); +} +#endif + + // ---------------------------------------------------------------------------- // random utils diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu index 16df62f34..e901fd654 100644 --- a/dev/cuda/encoder_forward.cu +++ b/dev/cuda/encoder_forward.cu @@ -17,24 +17,10 @@ version 3 is like version 2 but uses float4 reads/writes #include #include #include -#include "common.h" #include -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu index 8c64d7ca3..bbd81c4bc 100644 --- a/dev/cuda/gelu_backward.cu +++ b/dev/cuda/gelu_backward.cu @@ -19,23 +19,9 @@ version 2 uses the Packed128 data structure #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu index 27aa9d598..e07ad663a 100644 --- a/dev/cuda/gelu_forward.cu +++ b/dev/cuda/gelu_forward.cu @@ -19,23 +19,9 @@ version 2 is bfloat16 with the Packed128 data structure #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 1f432ba82..b3084e126 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -17,22 +17,9 @@ version 2 moves a lot of reduction to shared memory over global memory #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 - -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif -typedef Packed128 x128; +#include "common.h" // ---------------------------------------------------------------------------- // CPU code reference diff 
--git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu index bbbcde270..f07871a29 100644 --- a/dev/cuda/residual_forward.cu +++ b/dev/cuda/residual_forward.cu @@ -13,23 +13,10 @@ version 2 packs input into 128 bit memory reads #include #include #include -#include "common.h" -// turn on bf16 as default, done up here for now #define ENABLE_BF16 +#include "common.h" -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; // ---------------------------------------------------------------------------- // CPU code reference lol From 01c7a331bbcefa08ecb5cac21e0b9dacb79ef53f Mon Sep 17 00:00:00 2001 From: Anerudhan Date: Thu, 9 May 2024 19:07:00 +0000 Subject: [PATCH 047/172] - Simplify graph cache and usage of cudnn. - Fix failures in H100 --- cudnn_att.cpp | 133 +++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/cudnn_att.cpp b/cudnn_att.cpp index fd9760b1a..04b1a92ec 100644 --- a/cudnn_att.cpp +++ b/cudnn_att.cpp @@ -60,38 +60,35 @@ static void checkCudnnFE(fe::error_object e, const char *file, int line) { } #define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__) -using graph_tensors_fwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // O - std::shared_ptr // Stats ->; - -using graph_tensors_bwd = std::tuple, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::shared_ptr, // O - std::shared_ptr, // dO - std::shared_ptr, // Stats - std::shared_ptr, // Attn_scale, - std::shared_ptr, // dQ, - std::shared_ptr, // dK, - std::shared_ptr // dV ->; +enum UIDs { + Q_UID, + K_UID, + V_UID, + Attn_scale_UID, + O_UID, + Stats_UID, + dO_UID, + dQ_UID, + dK_UID, + dV_UID +}; // Need a cache because graph->build_operation_graph() is slow but everything else seems fast -using cache_type_fwd = std::unordered_map; -using cache_type_bwd = std::unordered_map; +using cache_type_fwd = std::map, std::shared_ptr>; +using cache_type_bwd = std::map, std::shared_ptr>; // Loosely based on cuDNN frontend samples functions and massively simplified -template -auto lookup_cache_or_build_graph_fwd(Args... args) { +auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) { + static cache_type_fwd user_maintained_cache_fwd; - auto [B, H, T, HS, is_inference_only] = std::make_tuple(args...); + auto key = std::make_tuple(B, H, T, HS, is_inference_only); + + auto it = user_maintained_cache_fwd.find(key); + if (it != user_maintained_cache_fwd.end()) { + return it->second; + } + auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) .set_intermediate_data_type(fe::DataType_t::FLOAT) @@ -100,16 +97,20 @@ auto lookup_cache_or_build_graph_fwd(Args... 
args) { // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, H, T, HS}) + .set_uid(Q_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, H, T, HS}) + .set_uid(K_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, H, T, HS}) + .set_uid(V_UID) .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) + .set_uid(Attn_scale_UID) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); @@ -122,38 +123,47 @@ auto lookup_cache_or_build_graph_fwd(Args... args) { auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32 - O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}); + O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID); assert(stats == nullptr || is_inference_only == false); if (is_inference_only == false) { stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) .set_dim({B, H, T, 1}) - .set_stride({H * T, T, 1, 1}); + .set_stride({H * T, T, 1, 1}) + .set_uid(Stats_UID); } checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_fwd.find(key); - if (it != user_maintained_cache_fwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); checkCudnnFE(graph->check_support(cudnn_handle)); checkCudnnFE(graph->build_plans(cudnn_handle)); - assert(graph->get_workspace_size() <= cudnn_workspace_size); // fwd shouldn't need workspace + // Reallocate the workspace if the required size is greater than the current workspace + // In H100 this may be around 16B + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } - auto tuple = std::make_tuple(graph, Q, K, V, attn_scale, O, stats); - user_maintained_cache_fwd.insert({key, tuple}); - return tuple; + user_maintained_cache_fwd.insert({key, graph}); + + return graph; } -template -auto lookup_cache_or_build_graph_bwd(Args... args) { +auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { static cache_type_bwd user_maintained_cache_bwd; - auto [B, NH, T, HS] = std::make_tuple(args...); + + auto key = std::make_tuple(B, NH, T, HS); + + auto it = user_maintained_cache_bwd.find(key); + if (it != user_maintained_cache_bwd.end()) { + return it->second; + } auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) @@ -164,28 +174,35 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { // must come from inp (which means we also need to convert THAT to FP16) auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") .set_dim({B, NH, T, HS}) + .set_uid(Q_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") .set_dim({B, NH, T, HS}) + .set_uid(K_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") .set_dim({B, NH, T, HS}) + .set_uid(V_UID) .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") .set_dim({B, NH, T, HS}) + .set_uid(O_UID) .set_stride({NH * HS * T, HS, NH * HS, 1})); auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") .set_dim({B, NH, T, HS}) + .set_uid(dO_UID) .set_stride({NH * HS * T, HS, NH * HS, 1})); auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") .set_dim({B, NH, T, 1}) + .set_uid(Stats_UID) .set_stride({NH * T, T, 1, 1}) .set_data_type(fe::DataType_t::FLOAT)); auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) + .set_uid(Attn_scale_UID) .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") .set_causal_mask(true) @@ -194,16 +211,11 @@ auto lookup_cache_or_build_graph_bwd(Args... args) { // Create the graph operation and get the output tensors back auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); - dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); - dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}); + dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID); + dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID); + dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID); checkCudnnFE(graph->validate()); - auto key = graph->key(); - auto it = user_maintained_cache_bwd.find(key); - if (it != user_maintained_cache_bwd.end()) { - return it->second; - } // Build the operation graph and execution part (this is the VERY SLOW PART) checkCudnnFE(graph->build_operation_graph(cudnn_handle)); @@ -221,9 +233,8 @@ auto lookup_cache_or_build_graph_bwd(Args... 
args) { cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); } - auto tuple = std::make_tuple(graph, Q, K, V, O, dO, stats, attn_scale, dQ, dK, dV); - user_maintained_cache_bwd.insert({key, tuple}); - return tuple; + user_maintained_cache_bwd.insert({key, graph}); + return graph; } void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) @@ -235,8 +246,7 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) bool is_inference_only = (stats == nullptr); // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, attn_scale, O, softmax_stats] = - lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); + auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); // Prepare all the tensor pointers for executing the graph void* devPtrQ = inp; @@ -246,12 +256,12 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) void* devPtrO = out; // Build variant pack - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {attn_scale, &attn_scale_cpu}, {O, devPtrO}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}}; // Add the stats tensor unless we are only doing inference (only needed for backward pass) if (is_inference_only == false) { - variant_pack[softmax_stats] = stats; + variant_pack[Stats_UID] = stats; } // Execute graph @@ -266,8 +276,7 @@ void attention_backward_cudnn(floatX* dqkvr, int HS = C / NH; // number of features per head // Get graph and tensors from cache (or generate it on first use) - auto [graph, Q, K, V, O, dO, Stats, attn_scale, dQ, dK, dV] = - lookup_cache_or_build_graph_bwd(B, NH, T, HS); + auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS); // Prepare all the tensor pointers for executing the graph void* devPtrQ = qkvr; @@ -283,10 +292,10 @@ void attention_backward_cudnn(floatX* dqkvr, void* devPtrdV = (dqkvr + 2 * NH * HS); // Build variant pack that links each tensor to its data pointer - std::unordered_map, void*> variant_pack = { - {Q, devPtrQ}, {K, devPtrK}, {V, devPtrV}, {O, devPtrO}, {dO, devPtrdO}, {Stats, devPtrStats}, - {dQ, devPtrdQ}, {dK, devPtrdK}, {dV, devPtrdV}, - {attn_scale, &attn_scale_cpu}}; + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats}, + {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV}, + {Attn_scale_UID, &attn_scale_cpu}}; // Execute graph checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); From 691c1df969bf893054731613f7e3ae9299aed86d Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Thu, 9 May 2024 23:48:11 +0200 Subject: [PATCH 048/172] fused layernorm+residual --- dev/cuda/Makefile | 3 +- dev/cuda/attention_forward.cu | 8 - dev/cuda/classifier_fused.cu | 8 - dev/cuda/common.h | 7 + dev/cuda/fused_residual_forward.cu | 695 +++++++++++++++++++++++++++++ dev/cuda/softmax_forward.cu | 8 - train_gpt2.cu | 145 +++++- 7 files changed, 833 insertions(+), 41 deletions(-) create mode 100644 dev/cuda/fused_residual_forward.cu diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index 834a98b0f..c74178851 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -18,7 +18,7 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux- $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ # Build all targets -TARGETS = adamw 
attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward +TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward all: $(TARGETS) # Individual targets: forward pass @@ -28,6 +28,7 @@ crossentropy_forward: crossentropy_forward.cu encoder_forward: encoder_forward.cu gelu_forward: gelu_forward.cu layernorm_forward: layernorm_forward.cu +fused_residual_forward: fused_residual_forward.cu residual_forward: residual_forward.cu softmax_forward: softmax_forward.cu trimat_forward: trimat_forward.cu diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index 66cfa629b..b22b3f132 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -240,14 +240,6 @@ __device__ float warpReduceMax(float val) { return val; } -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_down_sync(0xFFFFFFFF, val, offset); - } - return val; -} - __global__ void softmax_forward_kernel4(float* out, const float* inp, int N, int C) { // out is (N, C) just like inp. Each row of inp will get softmaxed. // same as kernel3, but can handle any block size (multiple of 32) diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index c44727f73..092de5955 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -98,14 +98,6 @@ void crossentropy_softmax_backward_cpu(float* dlogits, // ---------------------------------------------------- // Kernel Utils -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} - // warp-level reduction for finding the maximum value __device__ float warpReduceMax(float val) { for (int offset = 16; offset > 0; offset /= 2) { diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 788a8f505..5da54fe1d 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -10,6 +10,13 @@ __host__ __device__ T ceil_div(T dividend, T divisor) { return (dividend + divisor-1) / divisor; } +__device__ float warpReduceSum(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_xor_sync(0xFFFFFFFF, val, offset); + } + return val; +} + // ---------------------------------------------------------------------------- // checking utils diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu new file mode 100644 index 000000000..f228503af --- /dev/null +++ b/dev/cuda/fused_residual_forward.cu @@ -0,0 +1,695 @@ +/* +Kernels for residual forward pass fused with layernorm + +Compile example: +nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward + +version 1 is naive port from CPU code to kernel +./fused_residual_forward 1 +version 2 packs input into 128 bit memory reads +./fused_residual_forward 2 +*/ + +#include +#include +#include "assert.h" +#include + +#define ENABLE_BF16 
+#include "common.h" + +// ---------------------------------------------------------------------------- +// CPU code reference lol + +void residual_forward_cpu(float* out, const float* inp1, const float* inp2, int N) { + for (int i = 0; i < N; i++) { + out[i] = inp1[i] + inp2[i]; + } +} + +void layernorm_forward_cpu(float* out, float* mean, float* rstd, + const float* inp, const float* weight, const float* bias, + int B, int T, int C) { + float eps = 1e-5f; + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + // seek to the input position inp[b,t,:] + const float* x = inp + b * T * C + t * C; + // calculate the mean + float m = 0.0f; + for (int i = 0; i < C; i++) { + m += x[i]; + } + m = m/C; + // calculate the variance (without any bias correction) + float v = 0.0f; + for (int i = 0; i < C; i++) { + float xshift = x[i] - m; + v += xshift * xshift; + } + v = v/C; + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + // seek to the output position in out[b,t,:] + float* out_bt = out + b * T * C + t * C; + for (int i = 0; i < C; i++) { + float n = (s * (x[i] - m)); // normalized output + float o = n * weight[i] + bias[i]; // scale and shift it + out_bt[i] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[b * T + t] = m; + rstd[b * T + t] = s; + } + } +} + +// ---------------------------------------------------------------------------- +// GPU kernels + +// elementwise ops are nice and ez +__global__ void residual_forward_kernel1(floatX* out, const floatX* inp1, const floatX* inp2, int N) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N) { + out[idx] = (floatX)((float)inp1[idx] + (float)inp2[idx]); + } +} + +// naive drag and drop implementation into kernel, parallelize over B,T, loop over C +__global__ void layernorm_forward_kernel1(floatX* out, floatX* mean, floatX* rstd, + const floatX* inp, const floatX* weight, const floatX* bias, + int N, int C) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float eps = 1e-5f; + + if (idx < N) { + // seek to the input position inp[idx,:] + const floatX* x = inp + idx * C; + // calculate the mean + float m = 0.0f; + for (int i = 0; i < C; i++) { + m += (float)x[i]; + } + m = m / C; + // calculate the variance (without any bias correction) + float v = 0.0f; + for (int i = 0; i < C; i++) { + float xshift = (float)x[i] - m; + v += xshift * xshift; + } + v = v / C; + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + // seek to the output position in out[idx,:] + floatX* out_idx = out + idx * C; + for (int i = 0; i < C; i++) { + float n = (s * ((float)x[i] - m)); // normalized output + float o = n * (float)weight[i] + (float)bias[i]; // scale and shift it + out_idx[i] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[idx] = m; + rstd[idx] = s; + } +} + +// naive fusion; uncoalesced access pattern leads to terrible performance +__global__ void fused_residual_forward2(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + float eps = 1e-5f; + + float m = 0.0f; + for(int c = 0; c < C; ++c) { + float out = (float)inp1[c] + (float)inp2[c]; + m += out; + residual[c] = out; + } + + m = m / C; + float v = 0.0f; + for (int c = 0; c < C; c++) { + 
float xshift = (float)residual[c] - m; + v += xshift * xshift; + } + v = v / C; + + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + for (int c = 0; c < C; c++) { + float n = (s * ((float)residual[c] - m)); // normalized output + float o = n * (float)weight[c] + (float)bias[c]; // scale and shift it + normed[c] = o; // write + } + // cache the mean and rstd for the backward pass later + mean[idx] = m; + rstd[idx] = s; +} + +// handle one token per warp for coalesced access +__global__ void fused_residual_forward3(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + float eps = 1e-5f; + float m = 0.0f; + for(int c = threadIdx.x; c < C; c += WarpSize) { + float out = (float)inp1[c] + (float)inp2[c]; + m += out; + residual[c] = out; + } + + m = warpReduceSum(m); + + m = m / C; + float v = 0.0f; + for(int c = threadIdx.x; c < C; c += WarpSize) { + float xshift = (float)residual[c] - m; + v += xshift * xshift; + } + + v = warpReduceSum(v); + v = v / C; + + // calculate the rstd + float s = 1.0f / sqrtf(v + eps); + for(int c = threadIdx.x; c < C; c += WarpSize) { + float n = (s * ((float)residual[c] - m)); // normalized output + float o = n * (float)weight[c] + (float)bias[c]; // scale and shift it + normed[c] = o; // write + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +// vectorized loading, single pass stats, streaming access and zigzag loop +__global__ void fused_residual_forward_kernel4(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + using x128 = Packed128; + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + float sum_sq = 0.0f; + int c = threadIdx.x * x128::size; + for(; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + sum_sq += (float)out[k] * (float)out[k]; + } + store128(residual + c, out); + } + + sum = warpReduceSum(sum); + sum_sq = warpReduceSum(sum_sq); + + float m = sum / C; + float v = sum_sq / C - m * m; + float s = rsqrtf(v + eps); + + c -= WarpSize * x128::size; + for(; c >= 0; c -= WarpSize * x128::size) { + const x128 res = load128cs(residual + c); + const x128 w = load128(weight + c); + const x128 b = load128(bias + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +// what do you want in shared memory? EVERYTHING! 
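Kernel 4 above gets away with a single streaming pass by using the identity Var(x) = E[x^2] - E[x]^2; the price is numerical robustness, because when the mean is large relative to the spread the two terms nearly cancel in float32. A small host-side sketch of the effect (values are made up purely to exaggerate the cancellation):

```c
// one-pass vs. two-pass variance, illustrative only
#include <stdio.h>

int main(void) {
    const int C = 4;
    float x[4] = {1000.1f, 1000.2f, 1000.3f, 1000.4f}; // exact variance is 0.0125
    float sum = 0.f, sum_sq = 0.f;
    for (int i = 0; i < C; i++) { sum += x[i]; sum_sq += x[i] * x[i]; }
    float m = sum / C;
    float v_onepass = sum_sq / C - m * m;   // E[x^2] - E[x]^2, as in kernel 4
    float v_twopass = 0.f;                  // squared deviations, as in kernels 5 and 6
    for (int i = 0; i < C; i++) { v_twopass += (x[i] - m) * (x[i] - m); }
    v_twopass /= C;
    printf("one-pass: %g   two-pass: %g\n", v_onepass, v_twopass);
    return 0;
}
```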
+// thus, we no longer require zigzag loops and can do the numerically more stable variance estimation +// needs special attention in the kernel launcher to ensure we have enough smem. +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + + +// using multiple warps per token, and keep threads persistent, so we never have to reload weights and biases +// if we had one warp per token, though, this would require us to use a huge amount of shared memory. Therefore, +// we use multiple warps per token; but generally we cannot use the entire block, because that would give too +// little work per warp to be effective (each warp processes 256 bfloat16 elements, so for C=768 more than 3 warps +// will just mean idle). Therefore, we add a z dimension, where warps with different z handle different tokens. 
+// all this makes the launcher logic more complicated :( +__global__ void fused_residual_forward_kernel6(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + // weights and biases are shared among all tokens + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params + C * sizeof(floatX)); + // residual output (input to layernorm) is indpendent for each sub-block indicates by threadIdx.z + x128* s_res = reinterpret_cast(params + (2 + threadIdx.z) * C * sizeof(floatX) ); + // similarly, each sub-block needs its own reduction buffers + float* s_mean = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + threadIdx.z * 32 * sizeof(float)); + float* s_var = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + 32 * sizeof(float) * (blockDim.z + threadIdx.z)); + + int cidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + int step = blockDim.y * WarpSize * x128::size; + + for(int c = cidx; c < C; c += step) { + s_weight[c / x128::size] = load128(weight + c); + s_bias[c / x128::size] = load128(bias + c); + } + // the block-level reductions will cause sync before the first time we read these + // => no syncthreads needed here + + + // loop over all tokens + for(int tidx = blockIdx.x * blockDim.z + threadIdx.z; tidx < N; tidx += gridDim.x * blockDim.z) { + // adjust pointers to current token + floatX* residual_bt = residual + C * tidx; + floatX* normed_bt = normed + C * tidx; + const floatX* inp1_bt = inp1 + C * tidx; + const floatX* inp2_bt = inp2 + C * tidx; + + const float eps = 1e-5f; + float sum = 0.0f; + for (int c = cidx; c < C; c += step) { + const x128 in1 = load128cs(inp1_bt + c); + const x128 in2 = load128cs(inp2_bt + c); + x128 out; + for (int k = 0; k < x128::size; ++k) { + out[k] = (float) in1[k] + (float) in2[k]; + sum += (float) out[k]; + } + store128cs(residual_bt + c, out); + s_res[c / x128::size] = out; + } + sum = warpReduceSum(sum); + if(threadIdx.x == 0) { + s_mean[threadIdx.y] = sum; + } + __syncthreads(); + float m = warpReduceSum(threadIdx.x < blockDim.y ? s_mean[threadIdx.x] : 0.f) / C; + // normally, we'd syncthread here to make sure that no warp is already at the next + // iteration of the loop, messing with s_mean. The fact that we interleave s_mean and s_var means + // we don't need these additional syncs. + float v = 0.f; + + for (int c = cidx; c < C; c += step) { + const x128 res = s_res[c / x128::size]; + for (int k = 0; k < x128::size; ++k) { + v += ((float) res[k] - m) * ((float) res[k] - m); + } + } + + v = warpReduceSum(v); + if(threadIdx.x == 0) { + s_var[threadIdx.y] = v; + } + __syncthreads(); + v = warpReduceSum(threadIdx.x < blockDim.y ? 
s_var[threadIdx.x] : 0.f) / C; + float s = rsqrtf(v + eps); + + for (int c = cidx; c < C; c += step) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for (int k = 0; k < x128::size; ++k) { + float n = s * ((float) res[k] - m); // normalized output + float o = n * (float) w[k] + (float) b[k]; // scale and shift it + out[k] = o; + } + + store128(normed_bt + c, out); + } + // cache the mean and rstd for the backward pass later + if (threadIdx.x == 0 && threadIdx.y == 0) { + mean[tidx] = m; + rstd[tidx] = s; + } + } +} + + + +// ---------------------------------------------------------------------------- +// kernel launcher + +void fused_residual_forward1(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + const int grid_size_resid = ceil_div(N * C, block_size); + residual_forward_kernel1<<>>(residual, inp1, inp2, N*C); + cudaCheck(cudaGetLastError()); + const int grid_size_ln = ceil_div(N, block_size); + layernorm_forward_kernel1<<>>(normed, mean, rstd, residual, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward2(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + const int grid_size = ceil_div(N, (int)(block_size)); + fused_residual_forward2<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward3(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + fused_residual_forward3<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward4(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int block_y = block_size / 32; + const int grid_size = ceil_div(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. 
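To put numbers on the comment above: the request is (2 + block_y) * C * sizeof(floatX) bytes, covering one x128 copy of the weights, one of the biases, and one residual scratch row per warp. For the benchmark's C = 768 in bf16 (2 bytes) and block_size = 256 (block_y = 8) that is (2 + 8) * 768 * 2 = 15,360 bytes, comfortably under the default 48 KiB; at block_size = 1024 (block_y = 32) it grows to 52,224 bytes, at which point the cudaFuncSetAttribute opt-in is required and, if the device refuses it, the kernel 4 fallback takes over. FP32 activations double these figures.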
+ cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward6(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + int warps_per_token = max(1, C / Packed128::size / 32); + int total_warps = block_size / 32; + int block_z = max(1, total_warps / warps_per_token); + int block_y = max(1, total_warps / block_z); + size_t smem = (2 + block_z) * C * sizeof(floatX) + 64 * sizeof(float) * block_z; + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + const int num_blocks = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); + fused_residual_forward_kernel6<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + const int grid_size = ceil_div(N, total_warps); + fused_residual_forward_kernel4<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +// kernel version dispatch +void fused_residual_forward(int kernel_num, floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, const int block_size) { + switch (kernel_num) { + case 1: + fused_residual_forward1(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 2: + fused_residual_forward2(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 3: + fused_residual_forward3(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 4: + fused_residual_forward4(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 5: + fused_residual_forward5(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + case 6: + fused_residual_forward6(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C, block_size); + break; + default: + printf("Invalid kernel number\n"); + exit(1); + } +} + +// ---------------------------------------------------------------------------- + +int main(int argc, const char **argv) { + setup_main(); + + int B = 8; + int T = 1024; + int C = 768; + + // read kernel_num from command line + int kernel_num = 1; + if (argc > 1) { + kernel_num = atoi(argv[1]); + } + printf("Using kernel %d\n", kernel_num); + + // create host memory of random numbers + float* residual = (float*)malloc(B * T * C * sizeof(float)); + float* normed = (float*)malloc(B * T * C * sizeof(float)); + float* inp1 = make_random_float(B * T * C); + float* inp2 = make_random_float(B * T * C); + float* mean = (float*)malloc(B * T * sizeof(float)); + float* rstd = (float*)malloc(B * T * sizeof(float)); + float* weight = make_random_float(C); + float* 
bias = make_random_float(C); + + // move to GPU + floatX* d_residual; + floatX* d_normed; + floatX* d_inp1; + floatX* d_inp2; + floatX* d_mean; + floatX* d_rstd; + floatX* d_weight; + floatX* d_bias; + cudaCheck(cudaMalloc(&d_residual, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_normed, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_inp1, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_inp2, B * T * C * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_mean, B * T * sizeof(float))); + cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(float))); + cudaCheck(cudaMalloc(&d_weight, C * sizeof(float))); + cudaCheck(cudaMalloc(&d_bias, C * sizeof(float))); + cudaCheck(memcpy_convert(d_inp1, inp1, B * T * C)); + cudaCheck(memcpy_convert(d_inp2, inp2, B * T * C)); + cudaCheck(memcpy_convert(d_weight, weight, C)); + cudaCheck(memcpy_convert(d_bias, bias, C)); + + // first check the correctness of the kernel + residual_forward_cpu(residual, inp1, inp2, B * T * C); + layernorm_forward_cpu(normed, mean, rstd, residual, weight, bias, B, T, C); + + // time the kernel at different block sizes + int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + cudaCheck(cudaMemset(d_residual, 0, B * T * C * sizeof(floatX))); + fused_residual_forward(kernel_num, d_residual, d_normed, d_mean, d_rstd, d_inp1, d_inp2, d_weight, d_bias, + B*T, C, block_size); + float tol = std::is_same_v ? 1e-5 : 5e-2; + validate_result(d_residual, residual, "residual", B * T * C, tol); + validate_result(d_mean, mean, "mean", B * T, tol); + validate_result(d_rstd, rstd, "rstd", B * T, tol); + validate_result(d_normed, normed, "normed", B * T * C, tol); + } + + printf("All results match. Starting benchmarks.\n\n"); + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + + int repeat_times = 1000; + float elapsed_time = benchmark_kernel(repeat_times, fused_residual_forward, kernel_num, + d_residual, d_normed, d_mean, d_rstd, d_inp1, d_inp2, d_weight, d_bias, + B*T, C, block_size + ); + + // napkin math: estimate the memory bandwidth achieved + // for each (B,T,C) output element, we do 2 reads and 2 writes, plus 2 BT writes for mean/rstd + // and e.g. 
A100 40GB PCIe is advertised at 1,555GB/s + long memory_ops = B * T * (C * 4 + 2) * sizeof(floatX); + float memory_bandwidth = memory_ops / elapsed_time / 1e6; + float toks_per_msec = B * T / elapsed_time / 1e3; + + printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s | elements: %.2f ktok/ms\n", + block_size, elapsed_time, memory_bandwidth, toks_per_msec); + } + + // free memory + free(residual); + free(normed); + free(mean); + free(rstd); + free(weight); + free(bias); + free(inp1); + free(inp2); + cudaCheck(cudaFree(d_residual)); + cudaCheck(cudaFree(d_normed)); + cudaCheck(cudaFree(d_mean)); + cudaCheck(cudaFree(d_rstd)); + cudaCheck(cudaFree(d_weight)); + cudaCheck(cudaFree(d_bias)); + cudaCheck(cudaFree(d_inp1)); + cudaCheck(cudaFree(d_inp2)); + + return 0; +} diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index f611864f0..d0d38850d 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -182,14 +182,6 @@ __device__ float warpReduceMax(float val) { return val; } -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_down_sync(0xFFFFFFFF, val, offset); - } - return val; -} - __global__ void softmax_forward_kernel3(float* out, const float* inp, int N, int C) { // kernel must use block size of 32 extern __shared__ float shared[]; diff --git a/train_gpt2.cu b/train_gpt2.cu index 16ff756ce..fdc4968d2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -596,6 +596,87 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re } } +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + constexpr const int WarpSize = 32; + assert(blockDim.x == WarpSize); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! 
+ extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + + // inputs floatX, outputs FP32 (for current FP32-only activation path for this WIP) __global__ void permute_kernel(floatX* q, floatX* k, floatX* v, const floatX* inp, @@ -736,7 +817,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons } } -__global__ void residual_forward_kernel(floatX* out, floatX* inp1, floatX* inp2, int N) { +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2, int N) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; if (idx >= N) { return; } @@ -1184,7 +1265,7 @@ void encoder_backward(floatX* dwte, floatX* dwpe, } void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, - floatX* inp, floatX* weight, floatX* bias, + floatX* inp, const floatX* weight, const floatX* bias, int B, int T, int C) { NVTX_RANGE_FN(); const int block_size = 512; @@ -1321,7 +1402,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, cudaCheck(cudaGetLastError()); } -void residual_forward(floatX* out, floatX* inp1, floatX* inp2, int N) { +void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N) { NVTX_RANGE_FN(); const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * x128::size); @@ -1329,6 +1410,31 @@ void residual_forward(floatX* out, floatX* inp1, floatX* inp2, int N) { cudaCheck(cudaGetLastError()); } +void fused_residual_forward5(floatX* residual, floatX* normed, 
floatX* mean, floatX* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + const int block_size = 256; + int block_y = block_size / 32; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + residual_forward(residual, inp1, inp2, N*C); + layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C); + } + cudaCheck(cudaGetLastError()); +} + + void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; @@ -1855,17 +1961,17 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // forward pass ParameterTensors params = model->params; // for brevity ActivationTensors acts = model->acts; - floatX* residual; encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + // first layernorm isn't fused + layernorm_forward(acts.ln1, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C); + for (int l = 0; l < L; l++) { NvtxRange layer_range("Layer", l); - residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; + floatX* residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C; // get the pointers of the weights for this layer - floatX* l_ln1w = params.ln1w + l * C; - floatX* l_ln1b = params.ln1b + l * C; floatX* l_qkvw = params.qkvw + l * 3*C * C; floatX* l_qkvb = params.qkvb + l * 3*C; floatX* l_attprojw = params.attprojw + l * C * C; @@ -1879,8 +1985,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // get the pointers of the activations for this layer floatX* l_ln1 = acts.ln1 + l * B * T * C; - floatX* l_ln1_mean = acts.ln1_mean + l * B * T; - floatX* l_ln1_rstd = acts.ln1_rstd + l * B * T; floatX* l_qkvr = acts.qkvr + l * B * T * 3*C; floatX* l_atty = acts.atty + l * B * T * C; floatX* l_attproj = acts.attproj + l * B * T * C; @@ -1894,8 +1998,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo floatX* l_residual3 = acts.residual3 + l * B * T * C; // now do the forward pass - layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); - #ifdef ENABLE_CUDNN float* l_att = (float*)acts.att + l * B * NH * T; // cuDNN needs a smaller FP32 tensor matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); @@ -1910,16 +2012,27 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo #endif matmul_forward_cublaslt(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); - residual_forward(l_residual2, residual, l_attproj, B*T*C); - layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); + fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, l_attproj, l_ln2w, l_ln2b, B*T, C); matmul_forward_cublaslt(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); gelu_forward(l_fch_gelu, l_fch, B*T*4*C); matmul_forward_cublaslt(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 
4*C, C); - residual_forward(l_residual3, l_residual2, l_fcproj, B*T*C); + + // OK, fusion across blocks. + if(l+1 != L) { + floatX* l_ln1 = acts.ln1 + (l + 1) * B * T * C; + floatX* l_ln1_mean = acts.ln1_mean + (l + 1) * B * T; + floatX* l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T; + const floatX* l_ln1w = params.ln1w + (l + 1) * C; + const floatX* l_ln1b = params.ln1b + (l + 1) * C; + fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, l_fcproj, l_ln1w, l_ln1b, + B * T, C); + } else { + fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, l_fcproj, + params.lnfw, params.lnfb, + B * T, C); + } } - residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 - layernorm_forward(acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); // also forward the cross-entropy loss function if we have the targets From 5920143d47ad4f7ecde47941e31ebff6bf0aea98 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Fri, 10 May 2024 00:02:37 +0200 Subject: [PATCH 049/172] remove warning noise --- dev/cuda/common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 5da54fe1d..0c2079821 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -71,7 +71,9 @@ int cuda_threads_per_SM = 0; // needed to calculate how many blocks to launch template struct alignas(16) Packed128 { - __device__ Packed128() = default; + // Note: = default implicitly generates a __device__ function, but explicitly + // adding __device__ causes a lot of warnings. + Packed128() = default; __device__ explicit Packed128(int4 bits) { static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); memcpy(&payload, &bits, sizeof(bits)); From 75ec629f5215e6a91758dd0d0a14e2416150d1ce Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Fri, 10 May 2024 00:03:30 +0200 Subject: [PATCH 050/172] remove duplicate function --- dev/cuda/layernorm_backward.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index b3084e126..904a57e0c 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -113,13 +113,6 @@ void layernorm_backward_cpu(float* dinp, float* dweight, float* dbias, // GPU kernels // GPU helper functions for atomicAdd on smaller than 32-bit types -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} - #ifdef ENABLE_BF16 __device__ void atomicAddX(__nv_bfloat16* addr, __nv_bfloat16 val) { uintptr_t ptr_val = reinterpret_cast(addr); From 8ccf2f97f8ed2c05318de0d915132d9fe2eee060 Mon Sep 17 00:00:00 2001 From: Yijun Yu Date: Fri, 10 May 2024 06:49:24 +0800 Subject: [PATCH 051/172] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 29d53689e..f87161261 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project - Rust + - [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have same performance - 
[llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project - Zig From 5c90845f7a3e9e81d480edbc7db4fd58278d19fb Mon Sep 17 00:00:00 2001 From: lancer Date: Thu, 9 May 2024 22:04:02 -0700 Subject: [PATCH 052/172] update the -lcublas -lcublasLt flag in the comment --- dev/cuda/adamw.cu | 4 ++-- dev/cuda/attention_backward.cu | 2 +- dev/cuda/attention_forward.cu | 4 ++-- dev/cuda/crossentropy_forward.cu | 2 +- dev/cuda/crossentropy_softmax_backward.cu | 2 +- dev/cuda/encoder_backward.cu | 2 +- dev/cuda/encoder_forward.cu | 2 +- dev/cuda/fused_residual_forward.cu | 2 +- dev/cuda/gelu_backward.cu | 2 +- dev/cuda/gelu_forward.cu | 2 +- dev/cuda/layernorm_backward.cu | 2 +- dev/cuda/layernorm_forward.cu | 2 +- dev/cuda/matmul_backward.cu | 2 +- dev/cuda/matmul_backward_bias.cu | 2 +- dev/cuda/nccl_all_reduce.cu | 2 +- dev/cuda/residual_forward.cu | 2 +- dev/cuda/softmax_forward.cu | 2 +- dev/cuda/trimat_forward.cu | 2 +- 18 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dev/cuda/adamw.cu b/dev/cuda/adamw.cu index 23770b2c3..20a6560dd 100644 --- a/dev/cuda/adamw.cu +++ b/dev/cuda/adamw.cu @@ -6,8 +6,8 @@ References: * https://github.com/nvidia/apex/blob/master/csrc/multi_tensor_adam.cu Compile example: -nvcc adamw.cu -o adamw -nvcc -O3 --use_fast_math adamw.cu -o adamw +nvcc -lcublas -lcublasLt adamw.cu -o adamw +nvcc -O3 --use_fast_math -lcublas -lcublasLt adamw.cu -o adamw ./adamw diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu index 8e673d79f..c97dbeee8 100644 --- a/dev/cuda/attention_backward.cu +++ b/dev/cuda/attention_backward.cu @@ -2,7 +2,7 @@ Kernels for attention backward pass. Compile example: -nvcc -O3 --use_fast_math attention_backward.cu -o attention_backward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_backward.cu -o attention_backward version 1 is a naive first version OMP_NUM_THREADS=32 ./attention_backward 1 diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index a7b6fff34..b632b4a66 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -6,10 +6,10 @@ If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels See the README for cuDNN install instructions Compile example with cuDNN: -nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward +nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math --lcublas -lcublasLt -lcudnn attention_forward.cu -o attention_forward Compile example without cuDNN: -nvcc -O3 --use_fast_math -lcublas attention_forward.cu -o attention_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_forward.cu -o attention_forward version 1 is naive port from CPU code to kernel, parallelize over batch, time, heads only ./attention_forward 1 diff --git a/dev/cuda/crossentropy_forward.cu b/dev/cuda/crossentropy_forward.cu index 2385a6c4f..ca312ba36 100644 --- a/dev/cuda/crossentropy_forward.cu +++ b/dev/cuda/crossentropy_forward.cu @@ -2,7 +2,7 @@ Kernels for crossentropy forward pass. 
Compile example: -nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward version 1 is a straight-forward port from CPU code to kernel, parallel over B,T ./crossentropy_forward 1 diff --git a/dev/cuda/crossentropy_softmax_backward.cu b/dev/cuda/crossentropy_softmax_backward.cu index 164bceddf..27521bf60 100644 --- a/dev/cuda/crossentropy_softmax_backward.cu +++ b/dev/cuda/crossentropy_softmax_backward.cu @@ -2,7 +2,7 @@ Kernels for crossentropy forward pass. Compile example: -nvcc -O3 --use_fast_math crossentropy_softmax_backward.cu -o crossentropy_softmax_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward version 1 is a straight-forward port from CPU code to kernel, parallel over B,T ./crossentropy_softmax_backward 1 diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu index 8c96eaf46..53221878e 100644 --- a/dev/cuda/encoder_backward.cu +++ b/dev/cuda/encoder_backward.cu @@ -2,7 +2,7 @@ Kernels for the positional encoder forward pass in GPT-2. Compile example: -nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_backward.cu -o encoder_backward version 1 is naive port from CPU code to kernel parallelizes over B,T,C, uses atomics to add to dwte, dwpe diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu index e901fd654..39d5f0fa3 100644 --- a/dev/cuda/encoder_forward.cu +++ b/dev/cuda/encoder_forward.cu @@ -2,7 +2,7 @@ Kernels for the positional encoder forward pass in GPT-2. Compile example: -nvcc -O3 --use_fast_math encoder_forward.cu -o encoder_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_forward.cu -o encoder_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./encoder_forward 1 diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu index f228503af..b98a67c4b 100644 --- a/dev/cuda/fused_residual_forward.cu +++ b/dev/cuda/fused_residual_forward.cu @@ -2,7 +2,7 @@ Kernels for residual forward pass fused with layernorm Compile example: -nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt fused_residual_forward.cu -o fused_residual_forward version 1 is naive port from CPU code to kernel ./fused_residual_forward 1 diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu index bbd81c4bc..3d12dd864 100644 --- a/dev/cuda/gelu_backward.cu +++ b/dev/cuda/gelu_backward.cu @@ -2,7 +2,7 @@ Kernels for gelu backward pass. Compile example: -nvcc -O3 --use_fast_math gelu_backward.cu -o gelu_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_backward.cu -o gelu_backward If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file: diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu index e07ad663a..01abfe2b5 100644 --- a/dev/cuda/gelu_forward.cu +++ b/dev/cuda/gelu_forward.cu @@ -2,7 +2,7 @@ Kernels for gelu forward pass. 
Compile example: -nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file: diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 904a57e0c..575e0a962 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -2,7 +2,7 @@ Kernels for layernorm backward pass. Compile example: -nvcc -O3 --use_fast_math layernorm_backward.cu -o layernorm_backward +nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_backward.cu -o layernorm_backward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./layernorm_backward 1 diff --git a/dev/cuda/layernorm_forward.cu b/dev/cuda/layernorm_forward.cu index 5cefd408e..3e948289a 100644 --- a/dev/cuda/layernorm_forward.cu +++ b/dev/cuda/layernorm_forward.cu @@ -2,7 +2,7 @@ Kernels for layernorm forward pass. Compile example: -nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./layernorm_forward 1 diff --git a/dev/cuda/matmul_backward.cu b/dev/cuda/matmul_backward.cu index 9d3763930..dece1f6dc 100644 --- a/dev/cuda/matmul_backward.cu +++ b/dev/cuda/matmul_backward.cu @@ -2,7 +2,7 @@ Kernels for matmul backward pass. Compile example: -nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward OMP_NUM_THREADS=32 ./matmul_backward 1 */ diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 7aef54547..65b331699 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -2,7 +2,7 @@ Kernels for matmul backward pass bias only. Compile example: -nvcc -O3 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias +nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias ./matmul_backward_bias 1 ./matmul_backward_bias 2 diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu index 3bc9564f1..260ba02ba 100644 --- a/dev/cuda/nccl_all_reduce.cu +++ b/dev/cuda/nccl_all_reduce.cu @@ -5,7 +5,7 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc. Then aggregates the values in the resulting vectors. Compile example: -nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ nccl_all_reduce.cu -o nccl_all_reduce +nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce Run on 2 local GPUs (set -np to a different value to change GPU count): mpirun -np 2 ./nccl_all_reduce diff --git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu index f07871a29..fd7d1fb8e 100644 --- a/dev/cuda/residual_forward.cu +++ b/dev/cuda/residual_forward.cu @@ -2,7 +2,7 @@ Kernels for residual forward pass. 
Compile example: -nvcc -O3 --use_fast_math residual_forward.cu -o residual_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward version 1 is naive port from CPU code to kernel ./residual_forward 1 diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index d0d38850d..279549b28 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -2,7 +2,7 @@ Kernels for softmax forward pass. Compile example: -nvcc -O3 --use_fast_math softmax_forward.cu -o softmax_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt softmax_forward.cu -o softmax_forward version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C ./softmax_forward 1 diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu index 133ced16f..1c093e2a1 100644 --- a/dev/cuda/trimat_forward.cu +++ b/dev/cuda/trimat_forward.cu @@ -3,7 +3,7 @@ Triangular matrix multiplication as in autoregressive attention. A short story. by @ngc92 Compile: -nvcc -O3 --use_fast_math trimat_forward.cu -o trimat_forward -lcublas +nvcc -O3 --use_fast_math -lcublas -lcublasLt trimat_forward.cu -o trimat_forward -lcublas Run: From 6da5e63e2c25ba6c53d8e16438fcf595f81ac44d Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Fri, 10 May 2024 10:57:39 +0300 Subject: [PATCH 053/172] Fix detection of cudnn-frontend in '.' on Linux Plus minor legibility fixes in the cudnn-frontend explanation. --- Makefile | 10 ++++------ README.md | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index eabb5646d..46abdc9a5 100644 --- a/Makefile +++ b/Makefile @@ -19,8 +19,7 @@ NVCC_INCLUDES = NVCC_LDLIBS = NCLL_INCUDES = NVCC_CUDNN = -# overridable flag for multi-GPU training. by default we won't build with cudnn -# because it bloats up the compile time from a few seconds to ~minute +# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute USE_CUDNN ?= 0 # Function to check if a file exists in the PATH @@ -86,16 +85,15 @@ else endif # Check and include cudnn if available -# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH=your_path on the make command line +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line +# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include # Refer to the README for cuDNN install instructions ifeq ($(USE_CUDNN), 1) ifeq ($(SHELL_UNAME), Linux) - # hard-coded path for now in either . or ($HOME) directory - # this can be overridden by setting CUDNN_FRONTEND_PATH on the command line ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include - else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"),) + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists) $(info ✓ cuDNN found, will run with flash-attention) CUDNN_FRONTEND_PATH ?= cudnn-frontend/include else diff --git a/README.md b/README.md index f87161261..b4298acff 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ sudo apt-get update sudo apt-get -y install libcudnn9-dev-cuda-12 ``` -On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. So simply download the repo to your disk, currently assumed to be in your home directory (i.e. 
the Makefile looks for `~/cudnn-frontend/include`). +On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line. **Multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: From 3dbb0bb89cd8115c25bd4f423bed390ea011fd8b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 11 May 2024 17:19:41 +0000 Subject: [PATCH 054/172] bump the threshold for qkvw because flashattention expands the error here a tiny bit --- test_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 7613b6ba3..654e35db1 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) { // In that case it's ok to extend the tolerance by a bit, after a manual review. allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 8e-1f); allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 1e-2f); - allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.1e-1); // hmm a bit high + allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.4e-1); // hmm a bit high allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 4e-2f); allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 3e-2f); allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", 3e-2f); From b88f683569423adae51d878f78bfe71df973c6c1 Mon Sep 17 00:00:00 2001 From: Ross Wheeler Date: Sat, 11 May 2024 17:49:22 -0700 Subject: [PATCH 055/172] Add Windows to CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change notes: • Add Windows build/test to CI matrix build • Add Cuda Windows build • Replace v3 checkout actions to v4 per GitHub recommendations --- .github/workflows/ci.yml | 100 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52715eb9c..bb19f2ba5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,15 +12,16 @@ jobs: build-and-test-cpu: strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -37,18 +38,105 @@ jobs: - name: Train model run: python train_gpt2.py --device=cpu + - name: Download Win32 Make.exe + if: matrix.os == 'windows-latest' + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + if: matrix.os == 'windows-latest' + run: | + unzip make-bin-win64.zip + - name: Compile training and testing program + if: matrix.os != 'windows-latest' run: make test_gpt2 train_gpt2 + - name: Compile training and testing program for Windows + if: matrix.os == 
'windows-latest' + shell: cmd + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 + - name: Execute testing program (With OpenMP) + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' + shell: cmd + run: | + copy test_gpt2 test_gpt2.exe + test_gpt2.exe + - name: Compile training and testing program without OpenMP + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) + if: matrix.os != 'windows-latest' run: ./test_gpt2 + build-cuda-windows: + runs-on: windows-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download Win32 Make.exe + run: | + $wc = New-Object System.Net.WebClient + $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' + $output = './make-bin-win64.zip' + $wc.DownloadFile($url, $output) + + - name: Unzip Win32 Makefile + run: | + unzip make-bin-win64.zip + + - name: Install Cuda Toolkit 12.4 on Windows + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" 
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + + # Default installation path for CUDA Toolkit is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4 + - name: Add Path + run: | + echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Build Cuda targets + shell: cmd + working-directory: ${{ github.workspace }} + run: | + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu + build-cuda-fp32: runs-on: ubuntu-latest container: @@ -56,7 +144,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build FP32 checkpoint run: make train_gpt2fp32cu test_gpt2fp32cu @@ -71,7 +159,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -83,7 +171,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build project run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu @@ -95,7 +183,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install OpenMP and OpenMPI run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev From e64df911910bfd1f235d8f0e0e97478c67d16e53 Mon Sep 17 00:00:00 2001 From: ntr Date: Sun, 12 May 2024 16:02:36 +0200 Subject: [PATCH 056/172] Add Llm.cs by nietras to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b4298acff..41cef25af 100644 --- a/README.md +++ b/README.md @@ -344,6 +344,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project + - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. 
Clone and run ✅ - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) From ec92368f8b34dda6e5821556db0581dde9e0a75f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 12 May 2024 19:24:43 +0300 Subject: [PATCH 057/172] added current backward bias kernel to dev cuda --- dev/cuda/matmul_backward_bias.cu | 95 ++++++++++++++++++++++++++++++++ train_gpt2.cu | 4 +- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 65b331699..233b7a197 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -22,6 +22,8 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include #include + +//#define ENABLE_BF16 #include "common.h" // ---------------------------------------------------------------------------- @@ -45,6 +47,8 @@ void matmul_backward_bias_cpu(float* dinp, float* dweight, float* dbias, // ---------------------------------------------------------------------------- // GPU kernels +float* dbias_buffer; + __global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, int B, int T, int OC) { extern __shared__ float shared[]; int o = blockIdx.x; // range [0, OC) @@ -180,6 +184,66 @@ __global__ void matmul_backward_bias_kernel5(float* dbias, const float* dout, in } +__global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { + // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater +} + +__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC, const int block_size) { + // note: this kernel reads in floatX, but it writes to float! + // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs + // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX + // (this also results in higher accuracy than doing accumulation directly in floatX) + + // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
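+    // (note: in this standalone benchmark file, the block/grid setup those comments refer to
+    //  lives in the matmul_backward_bias7() launcher further down in this same file)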
+ const int block_size_x = 32; + const int block_size_y = block_size / block_size_x; // 16 + const int OC_per_warp = block_size_x * x128::size; // 256 at BF16 + + int local_oc = threadIdx.x * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + float accumulators[x128::size]; + extern __shared__ float shared[]; + + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + int thread_id = threadIdx.y * block_size_x + threadIdx.x; + for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { + shared[idx] = 0.0f; + } + __syncthreads(); + if(global_oc < OC) { + for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, + // so we accumulate in a conflict-free order, then reorder to match the global memory order + for (int k = 0; k < x128::size; k++) { + atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); + } + } + if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data + __syncthreads(); + // read the accumulated values in the conflict-free order + int i = threadIdx.x + (threadIdx.y * block_size_x); + float tmp = shared[i]; + __syncthreads(); + // write them back to shared memory in the global memory order + // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) + shared[local_oc + threadIdx.y] = tmp; + __syncthreads(); + // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) + if (i + blockIdx.x*OC_per_warp < OC) { + atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + } +} + + // ---------------------------------------------------------------------------- // kernel launcher @@ -224,6 +288,33 @@ void matmul_backward_bias5(float* dinp, float* dweight, float* dbias, matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } +void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, + float* dout, float* inp, float* weight, float* ones, + int B, int T, int C, int OC, int block_size) { + if(block_size < 128) { + block_size = 128; + } + // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) + // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end + // blockDim.x is 32 --> single warp being responsible for those 256 OCs + // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs + // gridDim.x is OC / 256 --> each block processes 256 OCs + // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! + const int warp_size = 32; + const int OC_per_warp = warp_size * x128::size; // 256 at BF16 + const int block_size_x = 32; + const int block_size_y = block_size / block_size_x; // 16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! 
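+    // e.g. (illustrative numbers) with block_size=512 and OC=768 at BF16, grid_size_x = 3; on a GPU with
+    // 108 SMs and 2048 threads per SM this gives grid_size_y = 221184 / (512*3) = 144 blocks striding over B*T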
+ + assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops + + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC, block_size); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); +} + void matmul_backward_bias(int kernel_num, float* dinp, float* dweight, float* dbias, float* dout, float* inp, float* weight, float* ones, @@ -244,6 +335,9 @@ void matmul_backward_bias(int kernel_num, case 5: matmul_backward_bias5(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); break; + case 7: + matmul_backward_bias7(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -276,6 +370,7 @@ int main(int argc, char **argv) { float* d_dout; cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(float))); cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(float))); + cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); cudaCheck(cudaMemcpy(d_dbias, dbias, OC * sizeof(float), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_dout, dout, B * T * OC * sizeof(float), cudaMemcpyHostToDevice)); diff --git a/train_gpt2.cu b/train_gpt2.cu index 3285aa1d5..1c6cdb136 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -869,7 +869,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // note: this kernel reads in floatX, but it writes to float! // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX - // (this also results in higher accuracy than doing doing accumulation directly in floatX) + // (this also results in higher accuracy than doing accumulation directly in floatX) // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
const int block_size = 512; @@ -897,7 +897,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i accumulators[k] += (float)packed_dout[k]; } } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance + // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, // so we accumulate in a conflict-free order, then reorder to match the global memory order for (int k = 0; k < x128::size; k++) { atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); From 2287da01207d8d4d92eaeb12cee0f2093dd46a6a Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 12 May 2024 19:42:32 +0300 Subject: [PATCH 058/172] enable bf16 --- dev/cuda/matmul_backward_bias.cu | 89 ++++++++++++++------------------ 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 233b7a197..024eca4d6 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -23,7 +23,7 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include -//#define ENABLE_BF16 +#define ENABLE_BF16 #include "common.h" // ---------------------------------------------------------------------------- @@ -49,16 +49,16 @@ void matmul_backward_bias_cpu(float* dinp, float* dweight, float* dbias, float* dbias_buffer; -__global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel1(floatX* dbias, const floatX* dout, int B, int T, int OC) { extern __shared__ float shared[]; int o = blockIdx.x; // range [0, OC) int tid = threadIdx.x; // range [0, block_size) int block_size = blockDim.x; - const float* x = dout + o; + const floatX* x = dout + o; // thread coarsening float sum = 0.0; for (int i = tid; i < B * T; i += block_size) { - sum += x[i * OC]; + sum += (float)x[i * OC]; } shared[tid] = sum; __syncthreads(); @@ -71,12 +71,12 @@ __global__ void matmul_backward_bias_kernel1(float* dbias, const float* dout, in } // write the final result (at thread 0) to global memory if (tid == 0) { - dbias[o] += shared[0]; + dbias[o] = (float)dbias[o] + shared[0]; } } // cooperative groups solution, one warp per output channel -__global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel2(floatX* dbias, const floatX* dout, int B, int T, int OC) { // dout is (B, T, OC), dbias is (OC) // e.g. 
if block_size = 128, then we have 4 warps per block, each in charge of one output channel namespace cg = cooperative_groups; @@ -89,7 +89,7 @@ __global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, in // first, thread coarsening to sum reduce the problem size from B*T to 32 float sum = 0.0f; for(int i = warp.thread_rank(); i < BT; i += warp.size()) { - sum += dout[i * OC + idx]; + sum += (float)dout[i * OC + idx]; } // now do a warp-level reduce to get the sum across the 32 threads in this warp sum = cg::reduce(warp, sum, cg::plus{}); @@ -99,7 +99,7 @@ __global__ void matmul_backward_bias_kernel2(float* dbias, const float* dout, in } } -__global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel3(floatX* dbias, const floatX* dout, int B, int T, int OC) { // dout is (B, T, OC), dbias is (OC) // in this version of the kernel the entire block of block_size is dedicated to one output channel namespace cg = cooperative_groups; @@ -114,7 +114,7 @@ __global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, in // round 1: thread coarsening to reduce the problem size from B*T to 32 float thread_sum = 0.0f; for(int i = threadIdx.x; i < BT; i += blockDim.x) { - thread_sum += dout[i * OC + idx]; + thread_sum += (float)dout[i * OC + idx]; } // now do a warp-level reduce to get the sum across the 32 threads in each warp float warp_sum = cg::reduce(warp, thread_sum, cg::plus{}); @@ -136,7 +136,7 @@ __global__ void matmul_backward_bias_kernel3(float* dbias, const float* dout, in // the idea is to employ one block to reduce along several columns, // where each block has a width of 32 columns to ensure coalesced access. // at the end we accumulate the reductions performed by the warps in each block via shared memory -__global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, int B, int T, int OC) { // this kernel is launched with 1D grid_dim of OC/32 // for example let's say block_size is 128 extern __shared__ float smem[]; // of size block_size (128) @@ -147,7 +147,7 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in // pointer to the start of the column for one lane of threads // so e.g. 
4 threads (of the same lane_id) will reduce this one column - const float* dout_col = dout + tl + lane_id; + const floatX* dout_col = dout + tl + lane_id; // column reductions by looping through the rows // each of the 4 threads offsets by its warp_id and then skips by vstep @@ -156,7 +156,7 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in // leading to a coalesced memory access pattern float dout_sum = 0.0f; for (int row = warp_id; row < B * T; row += vstep) { - dout_sum += dout_col[row * OC]; + dout_sum += (float)dout_col[row * OC]; } smem[lane_id + warp_id * warpSize] = dout_sum; __syncthreads(); @@ -171,13 +171,13 @@ __global__ void matmul_backward_bias_kernel4(float* dbias, const float* dout, in } } -__global__ void matmul_backward_bias_kernel5(float* dbias, const float* dout, int B, int T, int OC) { +__global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, int B, int T, int OC) { int oc = blockIdx.x * blockDim.x + threadIdx.x; if(oc >= OC) return; float sum = 0.0; // grid-wide loop for maximum parallelism for (int i = blockIdx.y; i < B * T; i += gridDim.y) { - sum += dout[i * OC + oc]; + sum += (float)dout[i * OC + oc]; } // and atomically add everything together. atomics within one block are conflict-free! atomicAdd(dbias + oc, sum); @@ -248,8 +248,7 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // kernel launcher // version1: simple cuBLAS calls -void matmul_backward_bias1(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias1(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { dim3 block_dim(block_size); dim3 grid_dim(OC); @@ -257,42 +256,37 @@ void matmul_backward_bias1(float* dinp, float* dweight, float* dbias, matmul_backward_bias_kernel1<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias2(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias2(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { // block_size 512 seems best const int grid_size = ceil_div(OC * 32, block_size); matmul_backward_bias_kernel2<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias3(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias3(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { // block_size 256 seems best matmul_backward_bias_kernel3<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias4(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias4(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { assert(OC % 32 == 0); // OC must be divisible by 32 for this kernel const int grid_size = OC / 32; matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias5(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias5(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } -void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void 
matmul_backward_bias7(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { - if(block_size < 128) { - block_size = 128; + if(block_size < 256) { + block_size = 256; } // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end @@ -315,28 +309,26 @@ void matmul_backward_bias7(float* dinp, float* dweight, float* dbias, cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } -void matmul_backward_bias(int kernel_num, - float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, float* ones, +void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { switch (kernel_num) { case 1: - matmul_backward_bias1(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias1(dbias, dout, B, T, C, OC, block_size); break; case 2: - matmul_backward_bias2(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias2(dbias, dout, B, T, C, OC, block_size); break; case 3: - matmul_backward_bias3(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias3(dbias, dout, B, T, C, OC, block_size); break; case 4: - matmul_backward_bias4(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); break; case 5: - matmul_backward_bias5(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); break; case 7: - matmul_backward_bias7(dinp, dweight, dbias, dout, inp, weight, ones, B, T, C, OC, block_size); + matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); break; default: printf("Invalid kernel number\n"); @@ -366,13 +358,13 @@ int main(int argc, char **argv) { float* dout = make_random_float(B * T * OC); // move to GPU - float* d_dbias; - float* d_dout; - cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(float))); - cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(float))); + floatX* d_dbias; + floatX* d_dout; + cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(floatX))); + cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(floatX))); cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); - cudaCheck(cudaMemcpy(d_dbias, dbias, OC * sizeof(float), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_dout, dout, B * T * OC * sizeof(float), cudaMemcpyHostToDevice)); + cudaCheck(memcpy_convert(d_dbias, dbias, OC)); + cudaCheck(memcpy_convert(d_dout, dout, B * T * OC)); // ncu debugging / profiling, do a single call // int block_size_debug; @@ -391,23 +383,22 @@ int main(int argc, char **argv) { for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; // memset the bias to zero - cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(float))); + cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(floatX))); // calculate the GPU version - matmul_backward_bias(kernel_num, NULL, NULL, d_dbias, d_dout, NULL, NULL, NULL, B, T, C, OC, block_size); + matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, C, OC, block_size); // compare printf("Checking correctness...\n"); - validate_result(d_dbias, dbias, "dbias", OC, 5e-3f); + float tol = std::is_same_v ? 
5e-3f : 1.0f; + validate_result(d_dbias, dbias, "dbias", OC, tol); printf("All results match for block_size=%d.\n\n", block_size); } // now benchmark the kernel for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; - float *d_dinp, *d_dweight, *d_inp, *d_weight, *d_ones; int repeat_times = 2000; float elapsed_time = benchmark_kernel(repeat_times, matmul_backward_bias, kernel_num, - d_dinp, d_dweight, d_dbias, d_dout, d_inp, d_weight, d_ones, - B, T, C, OC, block_size); + d_dbias, d_dout, B, T, C, OC, block_size); printf("block_size %d time %.4f ms\n", block_size, elapsed_time); } From b3a5d1da15e63c955af840cc395930ccccfe3ee4 Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 05:55:46 +0000 Subject: [PATCH 059/172] shard master_weights --- train_gpt2.cu | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index bd11734d9..ad4f9519c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2372,26 +2372,23 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { - cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + cudaCheck(cudaMalloc((void**)&model->master_weights, multi_gpu_config->shard_num_parameters * sizeof(float))); + copy_and_cast_kernel<<shard_num_parameters, 512), 512, 0, main_stream>>>( + model->master_weights, (floatX*)model->params_memory, multi_gpu_config->shard_num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for master copy of params\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); } } floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; - float* master_weights = NULL; - if (model->use_master_weights == 1) { - master_weights = model->master_weights + multi_gpu_config->shard_offset; - } int block_size = 512; int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, master_weights, grads_memory, + adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); @@ -2403,20 +2400,11 @@ void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) if (multi_gpu_config->num_processes == 1) return; if (multi_gpu_config->zero_stage == 1) { - // gather all parameter updates from each process - if (model->use_master_weights == 1) { - ncclCheck(ncclAllGather(model->master_weights + multi_gpu_config->shard_offset, model->master_weights, - multi_gpu_config->shard_num_parameters, ncclFloat, - 
multi_gpu_config->nccl_comm, 0)); - // Copy and cast master weights to params - copy_and_cast_kernel<<num_parameters, 512), 512>>>((floatX*)model->params_memory, model->master_weights, model->num_parameters); - } - else { - ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, - multi_gpu_config->shard_num_parameters, ncclFloatX, - multi_gpu_config->nccl_comm, 0)); - } - } + // gather updated shards of model->params_memory from each process + ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, + multi_gpu_config->shard_num_parameters, ncclFloatX, + multi_gpu_config->nccl_comm, 0)); + } cudaCheck(cudaGetLastError()); #endif } From 086ce2ff4de9752b34b3f098c648230f04684e1e Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 06:48:22 +0000 Subject: [PATCH 060/172] Remove unsused template specializations and refactor --- train_gpt2.cu | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index ad4f9519c..8a743ad80 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1282,16 +1282,6 @@ __device__ float cast_value(half val) { return __half2float(val); } -template<> -__device__ half cast_value(float val) { - return __float2half(val); -} - -template<> -__device__ __nv_bfloat16 cast_value<__nv_bfloat16, float>(float val) { - return __float2bfloat16(val); -} - template<> __device__ float cast_value(__nv_bfloat16 val) { return __bfloat162float(val); @@ -2302,7 +2292,6 @@ void gpt2_backward(GPT2 *model) { // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return value; // MPI doesn't support all reduce with mean, so we sum up, then divide. float result; mpiCheck(MPI_Allreduce(&value, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD)); @@ -2315,11 +2304,11 @@ float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_conf // Averages out the loss and gradients across all GPUs. No-op when multi-GPU is disabled. // todo - this version only works if all the parameters are the same size (floatX) void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { +#ifdef MULTI_GPU NVTX_RANGE_FN(); + if (multi_gpu_config->num_processes == 1) return; // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); -#ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return; // Average all gradients. 
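    // (the call below is an in-place all-reduce: send and receive buffers are both grads_memory,
    //  so after it completes every rank holds the same reduced gradients)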
ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, model->num_parameters, From f613ce895b30dc0b2bd1f7e81410c6a2dcdce74d Mon Sep 17 00:00:00 2001 From: chinthysl Date: Mon, 13 May 2024 09:13:02 +0000 Subject: [PATCH 061/172] Fix copy and cast params to master weights --- train_gpt2.cu | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 8a743ad80..830e644e6 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2353,32 +2353,32 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); + size_t num_parameters = multi_gpu_config->shard_num_parameters; + floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; + floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; + if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, multi_gpu_config->shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, multi_gpu_config->shard_num_parameters* sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); + printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { - cudaCheck(cudaMalloc((void**)&model->master_weights, multi_gpu_config->shard_num_parameters * sizeof(float))); - copy_and_cast_kernel<<shard_num_parameters, 512), 512, 0, main_stream>>>( - model->master_weights, (floatX*)model->params_memory, multi_gpu_config->shard_num_parameters); + cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); + copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (multi_gpu_config->shard_num_parameters * sizeof(float)) >> 20); + printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } } - floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; - floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; - int block_size = 512; - int num_blocks = CEIL_DIV(multi_gpu_config->shard_num_parameters, block_size); + int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, 
grads_memory, - model->m_memory, model->v_memory, multi_gpu_config->shard_num_parameters, + model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); } From c0329ebdba7c33b3703d359dc4f214ae986479c1 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:15:48 +0300 Subject: [PATCH 062/172] new kernel version with fewer atomics --- dev/cuda/matmul_backward_bias.cu | 82 ++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 024eca4d6..9bd4b0428 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -243,6 +243,73 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i } } +// We want to decrease the amount of channels handled by each block, so that we need fewer across-block reductions. +// We do this by realizing the following: For scalar memory access, we need to read one element per thread in a warp +// to read an entire cacheline, but for vectorized memory access, with 128 bit of data per thread, we only need eight +// threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. +// => blockDim.x == 4, blockDim.y == 32/4 = 8 +// +__global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, int B, int T, int OC) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); + + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 256 at BF16 + + int local_oc = warp_c * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + + if(global_oc < OC) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } + __syncthreads(); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + atomicAdd(dbias + global_oc + k, a); + } + } +} // ---------------------------------------------------------------------------- // kernel launcher @@ -309,6 +376,18 @@ void matmul_backward_bias7(floatX* dbias, floatX* dout, cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } +void matmul_backward_bias8(floatX* dbias, floatX* dout, + int B, int T, int C, int OC, int block_size) { + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int 
OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); +} + void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { switch (kernel_num) { @@ -330,6 +409,9 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, case 7: matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); break; + case 8: + matmul_backward_bias8(dbias, dout, B, T, C, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From 081d224b21c5991d548b5e5acf2fcfd96901036f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:39:32 +0300 Subject: [PATCH 063/172] automatically switch to buffer-less version if that can fill up the GPU --- dev/cuda/matmul_backward_bias.cu | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 9bd4b0428..3d5484df2 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -249,7 +249,8 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. // => blockDim.x == 4, blockDim.y == 32/4 = 8 // -__global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, int B, int T, int OC) { +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -306,7 +307,11 @@ __global__ void matmul_backward_bias_kernel8(float* dbias, const floatX* dout, i } if(warp_d == 0 && global_oc < OC) { // coalesced, but not cacheline-sized - atomicAdd(dbias + global_oc + k, a); + if constexpr (std::is_same_v) { + dbias[global_oc + k] = a; + } else { + atomicAdd(dbias + global_oc + k, a); + } } } } @@ -383,9 +388,15 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. 
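+    // As a rough illustration (exact numbers depend on the GPU, so treat this as an example rather than a
+    // guarantee): grid_size_y is cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x), clamped to
+    // at least 1, so it collapses to 1 exactly when the ceil_div(OC, OC_per_warp) blocks of block_size threads
+    // already saturate the GPU on their own, e.g. for a very wide output such as a vocabulary projection.
+    // In that case every output channel is reduced by a single block, so no cross-block reduction is needed
+    // and the kernel can write its result straight into dbias instead of going through the fp32 buffer.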
+ if(grid_size_y == 1) { + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + } else { + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + } } void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, @@ -457,7 +468,7 @@ int main(int argc, char **argv) { // matmul_backward_bias(kernel_num, NULL, NULL, d_dbias, d_dout, NULL, NULL, NULL, B, T, C, OC, block_size_debug); // exit(EXIT_SUCCESS); - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; // calculate the CPU reference matmul_backward_bias_cpu(NULL, NULL, dbias, dout, NULL, NULL, B, T, C, OC); From aa41b3262480f2288e80d7f3f55efb03cdef2701 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 17:48:25 +0300 Subject: [PATCH 064/172] update main file --- train_gpt2.cu | 130 +++++++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1c6cdb136..5a08cd249 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -865,57 +865,70 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -__global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, int B, int T, int OC) { - // note: this kernel reads in floatX, but it writes to float! - // this is because we're using atomics, which are super slow in < fp32 precision on < H100 GPUs - // so the trick is do fp32 atomics to a buffer, and then copy_and_cast the result to floatX - // (this also results in higher accuracy than doing accumulation directly in floatX) +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); - // see comments in matmul_backward() for an explanation of block/grid dimensions etc. 
- const int block_size = 512; - const int block_size_x = 32; - const int block_size_y = block_size / block_size_x; // 16 - const int OC_per_warp = block_size_x * x128::size; // 256 at BF16 + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 256 at BF16 - int local_oc = threadIdx.x * x128::size; + int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; - float accumulators[x128::size]; - __shared__ float shared[OC_per_warp]; + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; for (int k = 0; k < x128::size; k++) { accumulators[k] = 0.0f; } - int thread_id = threadIdx.y * block_size_x + threadIdx.x; - for (int idx = thread_id; idx < OC_per_warp; idx += block_size) { - shared[idx] = 0.0f; - } - __syncthreads(); + if(global_oc < OC) { - for (int idx = blockIdx.y*block_size_y + threadIdx.y; idx < B * T; idx += gridDim.y*block_size_y) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { x128 packed_dout = load128(dout + global_oc + idx*OC); for (int k = 0; k < x128::size; k++) { accumulators[k] += (float)packed_dout[k]; } - } - // we need to avoid shared memory bank conflicts for the atomicAdd to maximise performance, - // so we accumulate in a conflict-free order, then reorder to match the global memory order - for (int k = 0; k < x128::size; k++) { - atomicAdd(shared + threadIdx.x + (k * block_size_x), accumulators[k]); - } - } - if (threadIdx.y >= x128::size) { return; } // only need this many warps to reorder the data - __syncthreads(); - // read the accumulated values in the conflict-free order - int i = threadIdx.x + (threadIdx.y * block_size_x); - float tmp = shared[i]; - __syncthreads(); - // write them back to shared memory in the global memory order - // 8-way bank conflict for BF16 x128, but only 8x per threadblock (rather than 8x per warp) - shared[local_oc + threadIdx.y] = tmp; + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } __syncthreads(); - // now we do a perfectly coalesced atomic add to global memory (1x 128-byte cacheline per warp) - if (i + blockIdx.x*OC_per_warp < OC) { - atomicAdd(dbias + i + blockIdx.x*OC_per_warp, shared[i]); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + if constexpr (std::is_same_v) { + dbias[global_oc + k] = a; + } else { + atomicAdd(dbias + global_oc + k, a); + } + } } } @@ -1462,28 +1475,25 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // backward to bias, if given, does a += if (dbias != NULL) { - // Each warp is responsible for 32 * "x128::size" = 256 OCs at BF16 (OC must be a multiple of 256!) 
- // Block size is 512 threads (16 warps) and we reduce those 16 values into 1 at the end - // blockDim.x is 32 --> single warp being responsible for those 256 OCs - // blockDim.y is 16 --> 16 parallel independent warps processing the same OCs for different BTs - // gridDim.x is OC / 256 --> each block processes 256 OCs - // grimDim.y is max(1, (cuda_num_SMs * threads_per_SM) / (512 * gridDim.x)); --> fill up the entire GPU! - const int warp_size = 32; - const int block_size = 512; - const int OC_per_warp = warp_size * x128::size; // 256 at BF16 - const int block_size_x = 32; - const int block_size_y = block_size / block_size_x; // 16 - const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 3 horizontal blocks for 768 OCs at BF16 - const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount - / (block_size * grid_size_x)); // full GPU! - - assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops - - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // Each warp is responsible for 8 * "x128::size" = 64 OCs at BF16 (OC must be a multiple of 64!) + // Block size is 1024 | 768 threads (32|24 warps) and we reduce those values into 1 at the end + + const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; + + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! + + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. + if(grid_size_y == 1) { + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + } else { + cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + } } // backward to input, uses = in the backward pass (set the gradient) From 49ee3c830773903215279bdd4840d80b8df1dd1d Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 18:27:56 +0300 Subject: [PATCH 065/172] fix non-atomic version: * accumulate instead of assign * need dedicated argument to correctly handle the floatX == float case --- dev/cuda/matmul_backward_bias.cu | 14 ++++++++------ train_gpt2.cu | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 3d5484df2..155c20098 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -22,6 +22,7 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #include #include #include +#include #define ENABLE_BF16 #include "common.h" @@ -249,8 +250,9 @@ __global__ void matmul_backward_bias_kernel7(float* dbias, const floatX* dout, i // threads to fetch a cacheline, which means that we can already operate on a "depth" of four within a single warp. 
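// (spelling out the arithmetic: x128 carries 128 bits = 16 bytes per thread, so a 128-byte cacheline is
//  covered by 128/16 = 8 consecutive threads, and the remaining 32/8 = 4 threads of the warp can handle
//  4 different bt positions at once)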
// => blockDim.x == 4, blockDim.y == 32/4 = 8 // -template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -307,8 +309,8 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout } if(warp_d == 0 && global_oc < OC) { // coalesced, but not cacheline-sized - if constexpr (std::is_same_v) { - dbias[global_oc + k] = a; + if constexpr (!Atomic) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { atomicAdd(dbias + global_oc + k, a); } @@ -391,10 +393,10 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } diff --git a/train_gpt2.cu b/train_gpt2.cu index 5a08cd249..5a553eda9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -865,8 +865,13 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC) { +// templated because if we have enough channels, we can write directly to the bf16 dbias buffer, and otherwise +// we need to write to a fp32 temp buffer. The `Atomic` argument indicates whether we add atomically. We cannot +// (easily) use a regular runtime `if(blockDim.y == 1)` runtime condition, because that doesn't compile for older +// GPUs. +template +__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -921,10 +926,12 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout v += __shfl_down_sync(0xffffffff, v, 2, 4); a += v; } + + // coalesced, but not cacheline-sized writes if(warp_d == 0 && global_oc < OC) { - // coalesced, but not cacheline-sized - if constexpr (std::is_same_v) { - dbias[global_oc + k] = a; + // if we have only one block per result, no need for atomics + if constexpr (!Atomic) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { atomicAdd(dbias + global_oc + k, a); } @@ -1488,10 +1495,10 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. 
if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } From 65727d5a4da65eebe9b5dbee01201f9eac492b93 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 19:20:03 +0300 Subject: [PATCH 066/172] fix CI compile by disabling kernel 5 --- dev/cuda/matmul_backward_bias.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 155c20098..66a59801f 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -172,6 +172,7 @@ __global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, } } +#ifndef ENABLE_BF16 __global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, int B, int T, int OC) { int oc = blockIdx.x * blockDim.x + threadIdx.x; if(oc >= OC) return; @@ -183,6 +184,7 @@ __global__ void matmul_backward_bias_kernel5(floatX* dbias, const floatX* dout, // and atomically add everything together. atomics within one block are conflict-free! atomicAdd(dbias + oc, sum); } +#endif __global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { @@ -350,12 +352,14 @@ void matmul_backward_bias4(floatX* dbias, floatX* dout, matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); } +#ifndef ENABLE_BF16 void matmul_backward_bias5(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); } +#endif void matmul_backward_bias7(floatX* dbias, floatX* dout, int B, int T, int C, int OC, int block_size) { @@ -417,7 +421,12 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); break; case 5: +#ifndef ENABLE_BF16 matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); +#else + fprintf(stderr, "Kernel 5 is only supported for fp32"); + exit(1); +#endif break; case 7: matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); From c66e48c06c1a64b5de55fd37bd67531f8fcbcc85 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Mon, 13 May 2024 20:58:07 +0300 Subject: [PATCH 067/172] fixup comment --- dev/cuda/matmul_backward_bias.cu | 4 ++-- train_gpt2.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 66a59801f..0bf5e44dd 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -256,7 +256,7 @@ template __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -264,7 +264,7 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout int warp_c = (int)threadIdx.y; int block_d = (int)threadIdx.z; - const int OC_per_warp = bdy * x128::size; // 256 at BF16 + const int OC_per_warp = bdy * x128::size; // 
64 at BF16 int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; diff --git a/train_gpt2.cu b/train_gpt2.cu index 5a553eda9..dd98ea000 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -873,7 +873,7 @@ template __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -881,7 +881,7 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout int warp_c = (int)threadIdx.y; int block_d = (int)threadIdx.z; - const int OC_per_warp = bdy * x128::size; // 256 at BF16 + const int OC_per_warp = bdy * x128::size; // 64 at BF16 int local_oc = warp_c * x128::size; int global_oc = blockIdx.x * OC_per_warp + local_oc; From dd8c9f5ec9bf268cd0ed562a4f07214d0bfa1199 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 20:43:51 +0300 Subject: [PATCH 068/172] fix layernorm backward: accumulate weight gradient --- dev/cuda/layernorm_backward.cu | 10 ++++++---- train_gpt2.cu | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 575e0a962..e22155247 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -842,11 +842,13 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); - x128 dbias128; - x128 dweight128; + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; - dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*warpSize]; + float s_dw = scratch_dweight[shared_index + x*warpSize]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } store128(dbias + global_index, dbias128); store128(dweight + global_index, dweight128); diff --git a/train_gpt2.cu b/train_gpt2.cu index 3285aa1d5..469a0d6ac 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1029,11 +1029,13 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); - x128 dbias128; - x128 dweight128; + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - dbias128[x] = (floatX)scratch_dbias[shared_index + x*warpSize]; - dweight128[x] = (floatX)scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*warpSize]; + float s_dw = scratch_dweight[shared_index + x*warpSize]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } store128(dbias + global_index, dbias128); store128(dweight + global_index, dweight128); From e553e2f084b29bbf7a59006de593ac311d60fc19 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 20:44:50 +0300 Subject: [PATCH 069/172] update dev/cuda/layernorm_backward and improve `validate_result` to take into account fp epsilon when comparing results --- dev/cuda/common.h | 22 
+++++++++- dev/cuda/layernorm_backward.cu | 76 +++++++++++++--------------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 0c2079821..2757c67b5 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -3,6 +3,7 @@ #include #include #include +#include template @@ -260,13 +261,25 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, D* out_gpu = (D*)malloc(num_elements * sizeof(D)); cudaCheck(cudaMemcpy(out_gpu, device_result, num_elements * sizeof(D), cudaMemcpyDeviceToHost)); int nfaults = 0; +#ifndef ENABLE_BF16 + float epsilon = FLT_EPSILON; +#else + float epsilon = 0.079; +#endif for (int i = 0; i < num_elements; i++) { + // Skip masked elements + if(!isfinite(cpu_reference[i])) + continue; + // print the first few comparisons if (i < 5) { printf("%f %f\n", cpu_reference[i], (T)out_gpu[i]); } - // ensure correctness for all elements. We can set an "ignore" mask by writing NaN - if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && isfinite(cpu_reference[i])) { + // effective tolerance is based on expected rounding error (epsilon), + // plus any specified additional tolerance + float t_eff = tolerance + fabs(cpu_reference[i]) * epsilon; + // ensure correctness for all elements. + if (fabs(cpu_reference[i] - (T)out_gpu[i]) > t_eff) { printf("Mismatch of %s at %d: CPU_ref: %f vs GPU: %f\n", name, i, cpu_reference[i], (T)out_gpu[i]); nfaults ++; if (nfaults >= 10) { @@ -276,6 +289,11 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, } } + if (nfaults > 0) { + free(out_gpu); + exit(EXIT_FAILURE); + } + // reset the result pointer, so we can chain multiple tests and don't miss trivial errors, // like the kernel not writing to part of the result. // cudaMemset(device_result, 0, num_elements * sizeof(T)); diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index e22155247..90dcb1674 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -1014,25 +1014,6 @@ int main(int argc, char **argv) { float *dbias = make_zeros_float(C); layernorm_backward_cpu(dinp, dweight, dbias, dout, inp, weight, mean, rstd, B, T, C); - // convert all the necessary cpu data to floatX (e.g. 
bfloat16) - floatX* meanX = (floatX*)malloc(B * T * sizeof(floatX)); - floatX* rstdX = (floatX*)malloc(B * T * sizeof(floatX)); - floatX* doutX = (floatX*)malloc(B * T * C * sizeof(floatX)); - floatX* inpX = (floatX*)malloc(B * T * C * sizeof(floatX)); - floatX* weightX = (floatX*)malloc(C * sizeof(floatX)); - - for (int i = 0; i < B * T; i++) { - meanX[i] = (floatX)mean[i]; - rstdX[i] = (floatX)rstd[i]; - } - for (int i = 0; i < B * T * C; i++) { - doutX[i] = (floatX)dout[i]; - inpX[i] = (floatX)inp[i]; - } - for (int i = 0; i < C; i++) { - weightX[i] = (floatX)weight[i]; - } - // the above calculations act as the reference // now let's do the same on the GPU @@ -1063,33 +1044,39 @@ int main(int argc, char **argv) { cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(floatX))); cudaCheck(cudaMalloc(&d_scratch, cuda_num_SMs * (2 * C + 1) * sizeof(float))); // copy over the "inputs" to the backward call - cudaCheck(cudaMemcpy(d_dout, doutX, B * T * C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_inp, inpX, B * T * C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_weight, weightX, C * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_mean, meanX, B * T * sizeof(floatX), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_rstd, rstdX, B * T * sizeof(floatX), cudaMemcpyHostToDevice)); - // init the "outputs" of the backward call to zeros - cudaCheck(cudaMemset(d_dinp, 0, B * T * C * sizeof(floatX))); - cudaCheck(cudaMemset(d_dweight, 0, C * sizeof(floatX))); - cudaCheck(cudaMemset(d_dbias, 0, C * sizeof(floatX))); + cudaCheck(memcpy_convert(d_dout, dout, B * T * C)); + cudaCheck(memcpy_convert(d_inp, inp, B * T * C)); + cudaCheck(memcpy_convert(d_weight, weight, C)); + cudaCheck(memcpy_convert(d_mean, mean, B * T)); + cudaCheck(memcpy_convert(d_rstd, rstd, B * T)); // launch the kernel - const int block_size = 256; - layernorm_backward(kernel_num, d_dinp, d_dweight, d_dbias, d_scratch, d_dout, d_inp, d_weight, d_mean, d_rstd, B, T, C, block_size); - - // check the correctness of the kernel - float error_threshold_dinp = sizeof(floatX) == 4 ? 1e-3f : 1e-1f; // allow larger errors for BF16/FP16 - float error_threshold_dparams = sizeof(floatX) == 4 ? 1e-3f : 20.0f; // much, much larger... - printf("Checking correctness...\n"); - printf("dinp:\n"); - validate_result(d_dinp, dinp, "dinp", B * T * C, error_threshold_dinp); - printf("dweight:\n"); - validate_result(d_dweight, dweight, "dweight", C, error_threshold_dparams); - printf("dbias:\n"); - validate_result(d_dbias, dbias, "dbias", C, error_threshold_dparams); + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + // init the "outputs" of the backward call to zeros + cudaCheck(cudaMemset(d_dinp, 0, B * T * C * sizeof(floatX))); + cudaCheck(cudaMemset(d_dweight, 0, C * sizeof(floatX))); + cudaCheck(cudaMemset(d_dbias, 0, C * sizeof(floatX))); + + layernorm_backward(kernel_num, d_dinp, d_dweight, d_dbias, d_scratch, d_dout, d_inp, d_weight, d_mean, d_rstd, + B, T, C, block_size); + + // check the correctness of the kernel + float error_threshold_dinp = sizeof(floatX) == 4 ? 1e-3f : 1e-1f; // allow larger errors for BF16/FP16 + float error_threshold_dparams = sizeof(floatX) == 4 ? 1e-3f : 5e-1f; // much, much larger... 
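+        // (context for the smaller BF16 threshold: validate_result() now also grants |cpu_reference| * epsilon
+        //  of relative slack per element - FLT_EPSILON for fp32 builds, ~0.079 for BF16, per the common.h change
+        //  above - so this fixed threshold only has to cover the remaining absolute error, and the old 20.0f
+        //  blanket tolerance for the parameter gradients is no longer needed)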
+ printf("Checking correctness...\n"); + printf("dinp:\n"); + validate_result(d_dinp, dinp, "dinp", B * T * C, error_threshold_dinp); + printf("dweight:\n"); + validate_result(d_dweight, dweight, "dweight", C, error_threshold_dparams); + printf("dbias:\n"); + validate_result(d_dbias, dbias, "dbias", C, error_threshold_dparams); + + printf("All results match for block_size=%d.\n\n", block_size); + } // now time the kernel - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; int repeat_times = 100; @@ -1110,11 +1097,6 @@ int main(int argc, char **argv) { free(dinp); free(dweight); free(dbias); - free(meanX); - free(rstdX); - free(doutX); - free(inpX); - free(weightX); cudaCheck(cudaFree(d_dinp)); cudaCheck(cudaFree(d_dweight)); cudaCheck(cudaFree(d_dbias)); From 2d43e5bc977bac537ae1f6aff7de978fba328409 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 19:14:07 +0000 Subject: [PATCH 070/172] remove legacy comment --- dev/cuda/common.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 2757c67b5..f78e140a5 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -294,11 +294,6 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name, exit(EXIT_FAILURE); } - // reset the result pointer, so we can chain multiple tests and don't miss trivial errors, - // like the kernel not writing to part of the result. - // cudaMemset(device_result, 0, num_elements * sizeof(T)); - // AK: taking this out, ~2 hours of my life was spent finding this line - free(out_gpu); } From 3b5933ecfb9dca85e5663effdb662092aac11a7f Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 22:32:24 +0300 Subject: [PATCH 071/172] considerably speed up CPU matmul while still keeping it relatively readable --- train_gpt2.c | 70 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 9706a2c0b..06cdfbb54 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -158,32 +158,76 @@ void layernorm_backward(float* dinp, float* dweight, float* dbias, } } +void matmul_forward_slow(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // basic implementation of matrix multiplication. This serves as a fallback + // for bad input shapes, and as an illustration for the most basic version + // of the algorithm. +#pragma omp parallel for collapse(2) + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + int bt = b * T + t; + for (int o = 0; o < OC; o++) { + float val = (bias != NULL) ? bias[o] : 0.0f; + for (int i = 0; i < C; i++) { + val += inp[bt * C + i] * weight[o*C + i]; + } + out[bt * OC + o] = val; + } + } + } +} + void matmul_forward(float* out, - float* inp, float* weight, float* bias, + const float* inp, const float* weight, const float* bias, int B, int T, int C, int OC) { // most of the running time is spent here and in matmul_backward // OC is short for "output channels" // inp is (B,T,C), weight is (OC, C), bias is (OC) // out will be (B,T,OC) - #pragma omp parallel for collapse(2) - for (int b = 0; b < B; b++) { - for (int t = 0; t < T; t++) { - float* out_bt = out + b * T * OC + t * OC; - float* inp_bt = inp + b * T * C + t * C; - for (int o = 0; o < OC; o++) { - float val = (bias != NULL) ? 
bias[o] : 0.0f; - float* wrow = weight + o*C; - for (int i = 0; i < C; i++) { - val += inp_bt[i] * wrow[i]; + + // make sure the tiled loop will be correct, otherwise, fallback to slow version + const int LOOP_UNROLL = 8; + if (B*T % LOOP_UNROLL != 0) { + matmul_forward_slow(out, inp, weight, bias, B, T, C, OC); + return; + } + + // collapse the B and T loops into one and turn it into a strided loop. + // then we can tile the inner loop, and reuse the loaded weight LOOP_UNROLL many times + // for significant speed-ups. + #pragma omp parallel for + for (int obt = 0; obt < B * T; obt += LOOP_UNROLL) { + for (int o = 0; o < OC; o++) { + // keep LOOP_UNROLL many results in register, initialized by the bias term. + float result[LOOP_UNROLL]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of weight[i + o * C] and reuse it. + // we compile with -Ofast, so the compiler will turn the inner loop into a bunch of FMAs + for (int i = 0; i < C; i++) { + float w = weight[i + o * C]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + result[ibt] += inp[bt * C + i] * w; } - out_bt[o] = val; + } + + // write back results to main memory + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + out[bt * OC + o] = result[ibt]; } } } } void matmul_backward(float* dinp, float* dweight, float* dbias, - float* dout, float* inp, float* weight, + const float* dout, const float* inp, const float* weight, int B, int T, int C, int OC) { // most of the running time is spent here and in matmul_forward // this backward could be done in a single "round" of loops From b2a5508b84a0db561e371ef0092050e33c245a29 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 22:48:59 +0300 Subject: [PATCH 072/172] constness fixes --- train_gpt2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 06cdfbb54..0c5583e5e 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -237,10 +237,10 @@ void matmul_backward(float* dinp, float* dweight, float* dbias, #pragma omp parallel for collapse(2) for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { - float* dout_bt = dout + b * T * OC + t * OC; + const float* dout_bt = dout + b * T * OC + t * OC; float* dinp_bt = dinp + b * T * C + t * C; for (int o = 0; o < OC; o++) { - float* wrow = weight + o*C; + const float* wrow = weight + o*C; float d = dout_bt[o]; for (int i = 0; i < C; i++) { dinp_bt[i] += wrow[i] * d; @@ -253,8 +253,8 @@ void matmul_backward(float* dinp, float* dweight, float* dbias, for (int o = 0; o < OC; o++) { for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { - float* dout_bt = dout + b * T * OC + t * OC; - float* inp_bt = inp + b * T * C + t * C; + const float* dout_bt = dout + b * T * OC + t * OC; + const float* inp_bt = inp + b * T * C + t * C; float* dwrow = dweight + o*C; float d = dout_bt[o]; if (dbias != NULL) { dbias[o] += d; } From 92fc26eba4549057b40726d03e36d483e40542da Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 21:32:47 +0000 Subject: [PATCH 073/172] the nuts and bolts of gradient accumulation again, merged to master, but there is a bug and it doesn't work, debugging... 
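
The intended shape of the step, as a rough sketch (function and variable names match the diff below; this
leaves out the multi-GPU branch, timing, and logging, and is not the exact code): the fused classifier
pre-scales the loss gradient by 1/(B*T*grad_accum_steps), so gradients that accumulate with += across the
micro-steps come out as the mean over the full desired batch, and a single optimizer update is applied per
grad_accum_steps micro-batches.

    float lossf = 0.0f;  // mean loss over the accumulation steps
    for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) {
        dataloader_next_batch(&train_loader);
        // forward scales dloss by 1/(B*T*grad_accum_steps) internally
        gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps);
        lossf += model.mean_loss;   // mean_loss was already normalized by grad_accum_steps
        gpt2_backward(&model);      // parameter gradients accumulate with +=
    }
    model.mean_loss = lossf;
    gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1);
    gpt2_zero_grad(&model);
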
--- train_gpt2.cu | 86 ++++++++++++++++++++++++++++++++++----------------- train_gpt2.py | 49 ++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 39 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 78922a60b..69df89084 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1233,7 +1233,7 @@ __device__ SoftmaxParams prepare_softmax_blockwide3(int idx, const floatX* inp, template __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs, - const floatX* dlosses, const int* targets, + const float dloss, const int* targets, int B, int T, int V, int P) { int idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data int ix = targets[idx]; @@ -1247,8 +1247,6 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) losses[idx] = (floatX)(-logf(prob)); } - // very sensible default for dlosses is 1/(B*T), which is the uniform loss - float dloss = (dlosses != NULL) ? (float)dlosses[idx] : 1.0f / (B*T); // calculate the gradients directly, saves bandwidth from probs during training // but also supports writing probs for inference-only and debugging const floatX* logits_vec = logits + idx * P; @@ -1307,11 +1305,11 @@ __device__ float cast_value(half val) { template<> __device__ float cast_value(__nv_bfloat16 val) { return __bfloat162float(val); -} +} template __global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; // need to try grid stride looping for more perf later if (idx < n) { dst[idx] = cast_value(src[idx]); @@ -1647,13 +1645,13 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // replaces logits with logit gradients template void fused_classifier(Type* logits, Type* losses, - const Type* dlosses, const int* targets, + const float dloss, const int* targets, int B, int T, int V, int P) { NVTX_RANGE_FN(); const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel5<<>>(logits, losses, (Type*)NULL, dlosses, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1987,7 +1985,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->use_master_weights = 1; // keep master weights copy in float for optim update? } -void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true) { +void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true, int grad_accum_steps=1) { NVTX_RANGE_FN(); // targets are optional and could be NULL // in this function we must be careful and use size_t instead of int, otherwise @@ -2133,8 +2131,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // wait on memcpy of targets (definitely finished by now, but better safe than sorry) cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass - // we're passing dlosses = NULL, which will default them to 1.0f/(B*T), i.e. 
uniform loss - fused_classifier(acts.output, model->cpu_losses, (floatX*)NULL, model->targets, B, T, V, Vp); + const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements + fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event @@ -2149,9 +2147,10 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // accumulate the loss immediately if we are not going to run gpt2_backward(), e.g. inference if (get_loss) { + assert(targets != NULL); // makes no sense to request loss if we don't have targets cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T; + model->mean_loss /= B*T*grad_accum_steps; } } @@ -2624,8 +2623,9 @@ void error_usage() { fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); - fprintf(stderr, " -b batch size B (default = 4)\n"); + fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); + fprintf(stderr, " -d total desired batch size (default = B * T * num_processes, i.e. no grad accumulation\n"); fprintf(stderr, " -l learning rate (default = 3e-4f)\n"); fprintf(stderr, " -x max_steps of optimization to run (-1 (default) = disable, run 1 epoch)\n"); fprintf(stderr, " -v val_loss_every, how often we evaluate val loss (default = 20)\n"); @@ -2650,6 +2650,7 @@ int main(int argc, char *argv[]) { const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max + int total_batch_size = -1; // will be calculated down below later, if not provided float learning_rate = 3e-4f; int val_loss_every = 20; // every how many steps do we eval validation loss? int val_max_batches = 20; // how many batches max do we eval for validation loss? 
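// (a hedged usage example for the new flag - the command line and process count here are made up for
//  illustration, only the formula comes from the code further down: grad_accum_steps is computed as
//  total_batch_size / (B * T * num_processes), so "-b 4 -t 1024 -d 32768" on a single process gives
//  32768 / (4 * 1024 * 1) = 8 micro-batches per optimizer update, and -d must be an exact multiple of
//  B * T * num_processes or the assert below fires)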
@@ -2668,8 +2669,9 @@ int main(int argc, char *argv[]) { if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } - else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU batch size + else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } + else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); } else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); } else if (argv[i][1] == 'x') { max_steps = atoi(argv[i+1]); } else if (argv[i][1] == 'v') { val_loss_every = atoi(argv[i+1]); } @@ -2679,16 +2681,19 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } - else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } + else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else { error_usage(); } } + // calculate a sensible default for total batch size by assuming no gradient accumulation + if (total_batch_size == -1) { total_batch_size = B * T * multi_gpu_config.num_processes; } printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| Parameter | Value |\n"); printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| input dataset prefix | %-50s |\n", input_dataset_prefix); printf0("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); - printf0("| batch size B | %-50d |\n", B); + printf0("| micro batch size B | %-50d |\n", B); printf0("| sequence length T | %-50d |\n", T); + printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); @@ -2747,9 +2752,17 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // more prints related to allocations from gpt2_build_from_checkpoint down here to not mess up our table above - printf0("num_parameters: %zu ==> bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); + printf0("num_parameters: %zu => bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024))); + // figure out gradient accumulation from the desired total batch size + int tokens_per_fwdbwd = B * T * multi_gpu_config.num_processes; // one micro-batch processes this many tokens + assert(total_batch_size % tokens_per_fwdbwd == 0); + int grad_accum_steps = total_batch_size / tokens_per_fwdbwd; + printf0("batch_size B=%d * seq_len T=%d * num_processes=%d and total_batch_size=%d\n", + B, T, multi_gpu_config.num_processes, total_batch_size); + printf0("=> setting grad_accum_steps=%d\n", grad_accum_steps); + // set up the Logger & Tokenizer Logger logger; logger_init(&logger, output_log_file); @@ -2841,30 +2854,47 @@ int main(int argc, char *argv[]) { // the validation/sampling one last time, and then we break right here as we're done. 
if (last_step) { break; } - // do a training step + // --------------- TRAINING SECTION BEGIN ----------------- + // do one training step, doing forward/backward/update on total_batch_size tokens cudaEventRecord(start); - if (overfit_single_batch == 0 || (step == 0 && overfit_single_batch == 1)) { - // if we're overfitting a single batch, we'll only call this at step = 0 - dataloader_next_batch(&train_loader); + // gradient accumulation loop over micro-batches + float lossf = 0.0f; // for getting the mean loss over the accumulation steps + for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) { + // fetch the next data batch + // and if we're overfitting a single batch, we'll only call this a single time + if (overfit_single_batch == 0 || + (overfit_single_batch == 1 && step == 0 && micro_step == 0)) { + dataloader_next_batch(&train_loader); + } + // forward pass. note that we pass in grad_accum_steps, which scales down the loss + gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps); + lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward + // backward pass. all model params accumulate gradients with += inside this inner loop + gpt2_backward(&model); } - gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, false); - gpt2_zero_grad(&model); - gpt2_backward(&model); -#ifndef MULTI_GPU + // override the mean loss, accounting for the gradient accumulation loop + // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced + model.mean_loss = lossf; + // update the parameters +#ifndef MULTI_GPU gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); #else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); #endif + // zero out the gradients for the next iteration + gpt2_zero_grad(&model); + cudaEventRecord(end); + cudaCheck(cudaEventSynchronize(end)); // wait for the end event to finish to get correct timings + // --------------- TRAINING SECTION END ------------------- + // everything that follows now is just diagnostics, prints, logging, etc. // todo - move or double-buffer all of this timing logic to avoid idling the GPU at this point! - cudaEventRecord(end); float time_elapsed_ms; - cudaCheck(cudaEventSynchronize(end)); // wait for the end event to finish to get correct timings cudaCheck(cudaEventElapsedTime(&time_elapsed_ms, start, end)); - - float tokens_per_second = multi_gpu_config.num_processes * (B * T) / time_elapsed_ms * 1000.0; + size_t tokens_processed = (size_t)multi_gpu_config.num_processes * B * T * grad_accum_steps; + float tokens_per_second = tokens_processed / time_elapsed_ms * 1000.0; float bias_corrected_ema_tokens_per_second = tokens_per_second; // by default set to non-ema version if (step > 0) { // consider the first batch to be a warmup (e.g. 
cuBLAS/cuDNN initialisation) total_sum_iteration_time_s += time_elapsed_ms / 1000.0; diff --git a/train_gpt2.py b/train_gpt2.py index 80547b8f1..c50fc7b61 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -404,8 +404,9 @@ def print0(*args, **kwargs): parser.add_argument("--tensorcores", type=int, default=0, help="use tensorcores") parser.add_argument("--flash", type=int, default=0, help="use flash attention") parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run") - parser.add_argument("--batch_size", type=int, default=4, help="batch size") + parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") + parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") args = parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -443,6 +444,13 @@ def print0(*args, **kwargs): device = "mps" print(f"using device: {device}") + # calculate gradient accumulation from the desired total batch size and the current run configuration + tokens_per_fwdbwd = B * T * ddp_world_size + assert args.total_batch_size % tokens_per_fwdbwd == 0 + grad_accum_steps = args.total_batch_size // tokens_per_fwdbwd + print(f"total desired batch size: {args.total_batch_size}") + print(f"=> calculated gradient accumulation steps: {grad_accum_steps}") + # set up a context manager following the desired dtype and device ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[args.dtype] ctx = torch.amp.autocast(device_type="cuda", dtype=ptdtype) if device == "cuda" else nullcontext() @@ -544,14 +552,33 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] - for i in range(args.num_iterations): + for step in range(args.num_iterations): t0 = time.time() - with ctx: - _, loss = model(x, y, return_logits=False) - if not args.inference_only: - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() + + # micro-batch loop where we do gradient accumulation to reach desired total batch size + lossf = 0.0 # for getting the mean loss (as simple float) over the accumulation steps + for micro_step in range(grad_accum_steps): + # forward pass + with ctx: + _, loss = model(x, y, return_logits=False) + # we have to scale the loss to account for gradient accumulation, + # because the gradients just add on each successive backward(). 
+ # addition of gradients corresponds to a SUM in the objective, but + # instead of a SUM we want MEAN, so we scale the loss here + loss = loss / grad_accum_steps + lossf += loss.item() # keep track of the mean loss + if ddp: + # we want only the last micro-step to sync grads in a DDP model + # the official way to do this is with model.no_sync(), but that is a + # context manager that bloats the code, so we just toggle this variable + model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) + # backward pass + if not args.inference_only: + loss.backward() + # todo: grad clip here + optimizer.step() + optimizer.zero_grad(set_to_none=True) + # wait on the CPU for all device work to end so we get accurate per-iteration timings below if device == "mps": torch.mps.synchronize() @@ -560,9 +587,9 @@ def get_batch(): # time and print t1 = time.time() # the 0th iteration is often an outlier (much slower) => skip logging it - tokens_per_second = ddp_world_size * B * T / (t1-t0) - print0(f"iteration {i+1}, loss: {loss.item():.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") - if i > 0 and i > args.num_iterations - 20: + tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0) + print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") + if step > 0 and step > args.num_iterations - 20: timings.append(t1-t0) # print the average of the last 20 timings, to get something smooth-ish From a4567ae940d527ea9e53b4f366517c472d16f09e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 14 May 2024 22:13:54 +0000 Subject: [PATCH 074/172] delete parallels, still not fixed --- train_gpt2.cu | 48 +++++++----------------------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 69df89084..b5a2b1e5c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -123,12 +123,7 @@ cublasHandle_t cublas_handle; cudaDeviceProp deviceProp; // CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
-constexpr int num_parallel_streams = 2; // + 1 primary "main_stream" (+ default stream) -cudaStream_t parallel_streams[num_parallel_streams]; -cudaEvent_t parallel_events[num_parallel_streams]; cudaStream_t main_stream; -cudaEvent_t main_event; -cudaEvent_t loss_event; // to make sure fused_classifier has written the losses to the CPU buffer // convenience macro for calculating grid/block dimensions for kernels #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) @@ -1558,7 +1553,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, if(grid_size_y == 1) { matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float), main_stream); + cudaMemset(dbias_buffer, 0, OC * sizeof(float)); matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } @@ -1586,7 +1581,7 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; size_t shared_mem_size = (2 * C + 1) * sizeof(float); - cudaMemsetAsync(scratch, 0, (2 * C + 1) * sizeof(float), main_stream); + cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -2041,11 +2036,10 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // copy inputs/targets to the model // todo - inputs is copied on default stream so this synchronises CPU/GPU for now - cudaCheck(cudaMemcpyAsync(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice, 0)); + cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice)); if (targets != NULL) { // memcpy targets in parallel then wait for them before fused_classifier - cudaCheck(cudaMemcpyAsync(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice, parallel_streams[0])); - cudaEventRecord(parallel_events[0], parallel_streams[0]); + cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); } // forward pass @@ -2128,16 +2122,9 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // also forward the cross-entropy loss function if we have the targets if (targets != NULL) { NvtxRange classifier_and_loss_range("classifier_and_loss"); - // wait on memcpy of targets (definitely finished by now, but better safe than sorry) - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); // fused classifier: does the forward pass and first part of the backward pass const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); - - // the GPU now writes the losses directly to the CPU buffer allocated with cudaMallocHost() - // we accumulate cpu_losses at the end of gpt2_backward() waiting on this event - cudaEventRecord(loss_event, main_stream); - // reset mean_loss here so gpt2_backward() knows we have targets model->mean_loss = 0.0f; } else { @@ -2148,7 +2135,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // accumulate the loss immediately if we are not going to run gpt2_backward(), e.g. 
inference if (get_loss) { assert(targets != NULL); // makes no sense to request loss if we don't have targets - cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } model->mean_loss /= B*T*grad_accum_steps; } @@ -2157,11 +2143,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo void gpt2_zero_grad(GPT2 *model) { NVTX_RANGE_FN(); if (model->grads_memory != NULL) { - cudaCheck(cudaMemsetAsync(model->grads_memory, 0, model->num_parameters * sizeof(floatX), parallel_streams[0])); + cudaCheck(cudaMemset(model->grads_memory, 0, model->num_parameters * sizeof(floatX))); } - // Allow this to run in parallel with forward pass, but create a dependency with everything after (backwards pass) - cudaEventRecord(parallel_events[0], parallel_streams[0]); - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); } void gpt2_backward(GPT2 *model) { @@ -2207,10 +2190,7 @@ void gpt2_backward(GPT2 *model) { GradActTensors grads_acts = model->grads_acts; // reset residual stream gradients (put here to work with gradient accumulation) - cudaCheck(cudaMemsetAsync(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX), parallel_streams[0])); - // allow the memset to run in parallel with the forward pass, but create a dependency with everything after - cudaEventRecord(parallel_events[0], parallel_streams[0]); - cudaStreamWaitEvent(main_stream, parallel_events[0], 0); + cudaCheck(cudaMemset(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX))); // re-use the output buffer of the forward pass as a scratchpad during backward pass float* scratchF = (float*)acts.output; @@ -2302,7 +2282,6 @@ void gpt2_backward(GPT2 *model) { encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); // accumulate the loss, this was calculated at the end of gpt2_forward() - cudaCheck(cudaEventSynchronize(loss_event)); // hopefully finished long ago for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } model->mean_loss /= B*T; } @@ -2439,12 +2418,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } cudaCheck(cudaStreamCreate(&main_stream)); - cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamCreate(¶llel_streams[i])); - cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming); - } // set up cuBLAS and cuBLASLt (and cuDNN if enabled) cublasCheck(cublasCreate(&cublas_handle)); @@ -2463,14 +2436,7 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } void common_free(GPT2 &model) { - cudaCheck(cudaEventDestroy(main_event)); - cudaCheck(cudaEventDestroy(loss_event)); - for (int i = 0; i < num_parallel_streams; i++) { - cudaCheck(cudaStreamDestroy(parallel_streams[i])); - cudaCheck(cudaEventDestroy(parallel_events[i])); - } cudaCheck(cudaStreamDestroy(main_stream)); - gpt2_free(&model); cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); @@ -2819,7 +2785,7 @@ int main(int argc, char *argv[]) { // we re-calculate the forward pass for all of (B,T) positions from scratch // but the inference here is just for sanity checking anyway // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(&model, gen_tokens, NULL, B, T); + gpt2_forward(&model, gen_tokens, NULL, B, T, false); // furthermore, 
below we're only using b=0 (i.e. the first row) of all B rows // we're in principle running B "inference streams" in parallel here // only using position 0 because it's a bit faster (copy less probs from GPU -> CPU) From 3c3c965840b239480c87d3dfe04a70bf70986164 Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:16:24 -0700 Subject: [PATCH 075/172] Adding Mersenne Twisters C --- rand.h | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 rand.h diff --git a/rand.h b/rand.h new file mode 100644 index 000000000..f69340d20 --- /dev/null +++ b/rand.h @@ -0,0 +1,141 @@ +#ifndef RAND_H +#define RAND_H + +#include + +#define MERSENNE_STATE_M 397u +#define MERSENNE_STATE_N 624u + +#define LMASK 0x7ffffffful +#define UMASK 0x80000000ul + +// Copyright(c) Makoto Matsumoto and Takuji Nishimura + +// This implementation follows PyTorch so that we are numerically identical when running verification tests. + +typedef struct { + unsigned long long seed_; + int left_; + unsigned int next_; + unsigned int state_[MERSENNE_STATE_N]; + unsigned int MATRIX_A[2]; +} mt19937_state; + +void manual_seed(mt19937_state* state, unsigned int seed) { + state->MATRIX_A[0] = 0x0u; + state->MATRIX_A[1] = 0x9908b0df; + state->state_[0] = seed & 0xffffffff; + for (unsigned int j = 1; j < MERSENNE_STATE_N; j++) { + state->state_[j] = 1812433253 * (state->state_[j - 1] ^ (state->state_[j - 1] >> 30)) + j; + state->state_[j] &= 0xffffffff; + } + state->left_ = 1; + state->next_ = 0; +} + +void next_state(mt19937_state* state) { + state->left_ = MERSENNE_STATE_N; + state->next_ = 0; + unsigned int y, j; + for (j = 0; j < MERSENNE_STATE_N - MERSENNE_STATE_M; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + MERSENNE_STATE_M] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + for (; j < MERSENNE_STATE_N - 1; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + (MERSENNE_STATE_M - MERSENNE_STATE_N)] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + y = (state->state_[MERSENNE_STATE_N - 1] & UMASK) | (state->state_[0] & LMASK); + state->state_[MERSENNE_STATE_N - 1] = state->state_[MERSENNE_STATE_M - 1] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; +} + +unsigned int randint32(mt19937_state* state) { + if (!state) return 0; + if (state->MATRIX_A[0] != 0 || state->MATRIX_A[1] != 0x9908b0df) manual_seed(state, 5489); // auto-initialize + if (--state->left_ <= 0) { + next_state(state); + } + unsigned int y = state->state_[state->next_++]; + y ^= y >> 11; + y ^= (y << 7) & 0x9d2c5680; + y ^= (y << 15) & 0xefc60000; + y ^= y >> 18; + return y; +} + +inline unsigned long long randint64(mt19937_state* state) { + return (((unsigned long long)(randint32(state)) << 32) | randint32(state)); +} + +inline float randfloat32(mt19937_state* state) { + return (randint32(state) & ((1ull << 24) - 1)) * (1.0f / (1ull << 24)); +} + +inline double randfloat64(mt19937_state* state) { + return (randint64(state) & ((1ull << 53) - 1)) * (1.0 / (1ull << 53)); +} + +void uniform_(float* data, unsigned int numel, float from, float to, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state) * (to - from) + from; + } +} + +// Box–Muller transform + +void normal_fill_16(float* data, float mean, float std, mt19937_state* state) { + #define EPSILONE 1e-12 + for (unsigned int t = 0; t < 8; t++) { + float u1 = 1 - data[t]; + float u2 = 
data[t + 8]; + float radius = sqrtf(-2 * logf(u1 + EPSILONE)); + float theta = 2.0 * M_PI * u2; + data[t] = (radius * cosf(theta) * std + mean); + data[t + 8] = (radius * sinf(theta) * std + mean); + } +} + +void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state); + } + for (unsigned int i = 0; i < numel - 15; i += 16) { + normal_fill_16(data + i, mean, std, state); + } + if (numel % 16 != 0) { + // recompute the last 16 values + data = data + numel - 16; + for (unsigned int i = 0; i < 16; i++) { + data[i] = randfloat32(state); + } + normal_fill_16(data, mean, std, state); + } +} + +void normal_(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + #define EPSILONE 1e-12 + if (numel >= 16) { + normal_fill(data, numel, mean, std, state); + } + else { + double next_double_normal_sample; + int has_next_double_normal_sample = 0; + for (unsigned int t = 0; t < numel; t++) { + if (has_next_double_normal_sample) { + data[t] = (float)(next_double_normal_sample * std + mean); + has_next_double_normal_sample = 0; + continue; + } + // for numel < 16 we draw a double (float64) + float u1 = randfloat64(state); + float u2 = randfloat64(state); + float radius = sqrtf(-2 * logf(1 - u2 + EPSILONE)); + float theta = 2.0 * M_PI * u1; + next_double_normal_sample = radius * sinf(theta); + has_next_double_normal_sample = 1; + data[t] = (radius * cosf(theta) * std + mean); + } + } +} + +#endif \ No newline at end of file From 7de50af5115bff97980c9acb1a17f7fd875793aa Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:38:09 -0700 Subject: [PATCH 076/172] train_gpt.c: Removing the hardcoded GPT2_EOT --- train_gpt2.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 9706a2c0b..dbe4ca502 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1067,9 +1067,6 @@ void dataloader_free(DataLoader *loader) { // ---------------------------------------------------------------------------- // sampler -// the GPT-2 end-of-text token id -#define GPT2_EOT 50256 - unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; @@ -1149,7 +1146,7 @@ int main() { if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation for(int i = 0; i < B * T; ++i) { - gen_tokens[i] = GPT2_EOT; + gen_tokens[i] = tokenizer.eot; } // now sample from the model autoregressively printf("generating:\n---\n"); From 16f9dad3011a0932e39f5762ec054863023455dd Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 17:45:08 -0700 Subject: [PATCH 077/172] Update train_gpt2.c --- train_gpt2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_gpt2.c b/train_gpt2.c index dbe4ca502..95c46ab86 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1146,7 +1146,7 @@ int main() { if (step > 0 && step % 20 == 0) { // fill up gen_tokens with the GPT2_EOT, which kicks off the generation for(int i = 0; i < B * T; ++i) { - gen_tokens[i] = tokenizer.eot; + gen_tokens[i] = tokenizer.eot_token; } // now sample from the model autoregressively printf("generating:\n---\n"); From 160b3bd007336bb5866deed7adf60cd9a6cf1a1f Mon Sep 17 00:00:00 2001 From: Azret Botash Date: Tue, 14 May 2024 18:22:39 -0700 Subject: [PATCH 078/172] Setting up dev/CPU area with the first matmul_forward.c --- dev/cpu/matmul_forward.c | 217 
+++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 dev/cpu/matmul_forward.c diff --git a/dev/cpu/matmul_forward.c b/dev/cpu/matmul_forward.c new file mode 100644 index 000000000..f7b714326 --- /dev/null +++ b/dev/cpu/matmul_forward.c @@ -0,0 +1,217 @@ +/* +CPU Kernels for matmul forward pass. +*/ + +// Compile Examples: +// +// MSVC: cl.exe /O2 /fp:fast /Qvec-report:2 /I. /I ..\..\dev matmul_forward.c +// cl.exe /O2 /fp:fast /Qvec-report:2 /arch:AVX /I. /I ..\..\dev matmul_forward.c +// cl.exe /O2 /fp:fast /Qvec-report:2 /arch:AVX2 /I. /I ..\..\dev matmul_forward.c +// + +#include +#include +#include +#include +#include + +// ---------------------------------------------------------------------------- +// CPU code reference + +void matmul_forward_cpu(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // OC is short for "output channels" + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // out will be (B,T,OC) + for (int b = 0; b < B; b++) { + for (int t = 0; t < T; t++) { + float* out_bt = out + b * T * OC + t * OC; + const float* inp_bt = inp + b * T * C + t * C; + for (int o = 0; o < OC; o++) { + float val = (bias != NULL) ? bias[o] : 0.0f; + const float* wrow = weight + o*C; + for (int i = 0; i < C; i++) { + val += inp_bt[i] * wrow[i]; + } + out_bt[o] = val; + } + } + } +} + +void matmul_forward_ngc92(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // most of the running time is spent here and in matmul_backward + // OC is short for "output channels" + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // out will be (B,T,OC) + + // make sure the tiled loop will be correct, otherwise, fallback to slow version + #define LOOP_UNROLL 8 + + if (B * T % LOOP_UNROLL != 0) { + printf("MUST BE A MULTIPLE OF 8"); // FIXME + return; + } + + // collapse the B and T loops into one and turn it into a strided loop. + // then we can tile the inner loop, and reuse the loaded weight LOOP_UNROLL many times + // for significant speed-ups. + for (int obt = 0; obt < B * T; obt += LOOP_UNROLL) { + for (int o = 0; o < OC; o++) { + // keep LOOP_UNROLL many results in register, initialized by the bias term. + float result[LOOP_UNROLL]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of weight[i + o * C] and reuse it. 
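+ // (each cached w is reused for LOOP_UNROLL different (b,t) rows, so a weight element is read from memory once per tile of rows rather than once per output element)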
+ // we compile with -Ofast, so the compiler will turn the inner loop into a bunch of FMAs + for (int i = 0; i < C; i++) { + float w = weight[i + o * C]; + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + result[ibt] += inp[bt * C + i] * w; + } + } + + // write back results to main memory + for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) { + int bt = obt + ibt; + out[bt * OC + o] = result[ibt]; + } + } + } +} + +#define NUM_KERNELS 2 + +void matmul_forward(int kernel_num, + float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + + switch (kernel_num) { + case 0: + matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC); + break; + case 1: + matmul_forward_ngc92(out, inp, weight, bias, B, T, C, OC); + break; + default: + printf("Invalid kernel number\n"); + exit(1); + } +} + + +void validate_results_cpu(const float* device_result, const float* cpu_reference, const char* name, int num_elements, float tolerance); +float* make_random_float(size_t N); + +int main(int argc, char **argv) { + srand(0); + + int B = 8; + int T = 1024; + int C = 768; + int OC = 768 * 4; // expansion of 4, e.g. in the MLP + int RUNS = 4; // number of times to run a kernel for benchmarks + + srand(137); + + float* out = make_random_float(B * T * OC); + float* inp = make_random_float(B * T * C); + float* weight = make_random_float(OC * C); + float* bias = make_random_float(OC); + + float* grad_out = make_random_float(B * T * OC); + float* grad_inp = make_random_float(B * T * C); + float* grad_weight = make_random_float(OC * C); + float* grad_bias = make_random_float(OC); + + printf("> Calculating reference\n"); + matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC); + + for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) { + printf("> Verifying kernel #%d\n", kernel_num); + + srand(137); + + float* kernel_out = make_random_float(B * T * OC); + float* kernel_inp = make_random_float(B * T * C); + float* kernel_weight = make_random_float(OC * C); + float* kernel_bias = make_random_float(OC); + + matmul_forward(kernel_num, kernel_out, kernel_inp, kernel_weight, kernel_bias, B, T, C, OC); + + validate_results_cpu(kernel_out, out, "out", B * T * OC, 1e-5); + + free(kernel_out); + free(kernel_inp); + free(kernel_weight); + free(kernel_bias); + } + + printf("All kernels passed! 
Starting benchmarks.\n\n"); + + for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) { + printf("> Running kernel #%d\n", kernel_num); + struct timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); + + for (int i = 0; i < RUNS; i++) { + matmul_forward(kernel_num, out, inp, weight, bias, B, T, C, OC); + } + + clock_gettime(CLOCK_MONOTONIC, &end); + double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; + printf("> Kernel #%d, (took %f ms)\n", kernel_num, time_elapsed_s * 1000); + } + + // free memory + free(out); + free(inp); + free(weight); + free(bias); + + free(grad_out); + free(grad_inp); + free(grad_weight); + free(grad_bias); + + return 0; +} + +float* make_random_float(size_t N) { + float* arr = (float*)malloc(N * sizeof(float)); + for (size_t i = 0; i < N; i++) { + arr[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; // range -1..1 + } + return arr; +} + +void validate_results_cpu(const float* kernel_result, const float* cpu_reference, const char* name, int num_elements, float tolerance) { + int nfaults = 0; + for (int i = 0; i < num_elements; i++) { + // print the first few comparisons + if (i < 5) { + printf("%f %f\n", cpu_reference[i], kernel_result[i]); + } + float t_eff = tolerance + fabs(cpu_reference[i]); + // ensure correctness for all elements. + if (fabs(cpu_reference[i] - kernel_result[i]) > t_eff) { + printf("Mismatch of %s at %d: CPU_ref: %f vs CPU_new: %f\n", name, i, cpu_reference[i], kernel_result[i]); + nfaults++; + if (nfaults >= 10) { + exit(EXIT_FAILURE); + } + } + } + if (nfaults > 0) { + exit(EXIT_FAILURE); + } + printf("OK\n"); +} \ No newline at end of file From 8eb3a432d991bf23bed3fae9f92dd395015ea457 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 15 May 2024 21:09:46 +0000 Subject: [PATCH 079/172] revert all streams and synchronization events, we'll bring them back but only one at a time and very very carefully, because they cause really subtle bugs and issues --- train_gpt2.cu | 96 ++++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 55 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b5a2b1e5c..b53f7712c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -122,9 +122,6 @@ cublasLtHandle_t cublaslt_handle; cublasHandle_t cublas_handle; cudaDeviceProp deviceProp; -// CUDA streams & events (note: non-timing events, use separate events for timing/profiling!) 
-cudaStream_t main_stream; - // convenience macro for calculating grid/block dimensions for kernels #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) @@ -1327,7 +1324,7 @@ void encoder_forward(floatX* out, const int block_size = 256; const int N = B * T * C; const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size)); - encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); + encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1338,7 +1335,7 @@ void encoder_backward(floatX* dwte, floatX* dwpe, const int N = B * T * C; const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); + encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); cudaCheck(cudaGetLastError()); } @@ -1349,7 +1346,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, const int block_size = 512; const int N = B * T; const int grid_size = CEIL_DIV(N * 32, block_size); - layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); + layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); cudaCheck(cudaGetLastError()); } @@ -1413,7 +1410,7 @@ void matmul_forward_cublaslt(floatX* out, cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, &alpha, weight, weightLayout, inp, inputLayout, &beta, out, outputLayout, out, outputLayout, &heuristic.algo, - cublaslt_workspace, cublaslt_workspace_size, main_stream)); + cublaslt_workspace, cublaslt_workspace_size, 0)); // cleanups cublasCheck(cublasLtMatmulPreferenceDestroy(preference)); @@ -1445,7 +1442,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, v = qkvr + 2 * B * T * C; int total_threads = B * NH * T * HS; int num_blocks = CEIL_DIV(total_threads, block_size); - permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); + permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); floatX* preatt = inp; @@ -1460,7 +1457,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); int grid_size = CEIL_DIV(B * NH * T * 32, block_size); - softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); + softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); // new approach: first cuBLAS another batched matmul floatX* vaccum = inp; @@ -1476,7 +1473,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, // now unpermute // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); + unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); cudaCheck(cudaGetLastError()); } @@ -1484,7 +1481,7 @@ void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N NVTX_RANGE_FN(); const int block_size = 256; const int grid_size = CEIL_DIV(N, block_size * x128::size); - residual_forward_kernel<<>>(out, inp1, inp2, N); + residual_forward_kernel<<>>(out, inp1, inp2, N); cudaCheck(cudaGetLastError()); } @@ -1517,7 +1514,7 @@ void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_forward_kernel2<<>>(out, inp, N); + gelu_forward_kernel2<<>>(out, inp, N); cudaCheck(cudaGetLastError()); } @@ -1525,7 +1522,7 @@ void gelu_backward(floatX* dinp, const floatX* inp, const floatX* dout, const in NVTX_RANGE_FN(); const int 
block_size = 128; const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_backward_kernel<<>>(dinp, inp, dout, N); + gelu_backward_kernel<<>>(dinp, inp, dout, N); cudaCheck(cudaGetLastError()); } @@ -1551,11 +1548,11 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); } else { cudaMemset(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); } } @@ -1582,7 +1579,7 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr size_t shared_mem_size = (2 * C + 1) * sizeof(float); cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); - layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1610,7 +1607,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward through the unpermute operation int num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); + unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); // backward into datt cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, &beta, @@ -1622,7 +1619,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da // backward into preatt int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); - softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); + softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); // backward into q cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &alpha, k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, @@ -1633,7 +1630,7 @@ void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* da dk, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); // backward into inp num_blocks = CEIL_DIV(B * NH * T * HS, block_size); - permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); + permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); cudaCheck(cudaGetLastError()); } @@ -1646,7 +1643,7 @@ void fused_classifier(Type* logits, Type* losses, const int block_size = 1024; const int N = B * T; const int grid_size = N; - fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); + fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -1738,7 +1735,7 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen return params_memory; } -#define NUM_ACTIVATION_TENSORS 20 +#define NUM_ACTIVATION_TENSORS 21 typedef struct { floatX* encoded; // (B, T, C) floatX* ln1; // (L, B, T, C) @@ -1758,6 +1755,7 @@ typedef 
struct { floatX* lnf; // (B, T, C) floatX* lnf_mean; // (B, T) floatX* lnf_rstd; // (B, T) + floatX* losses; // (B, T) // adding these two compared to the CPU .c code, needed for attention kernel as buffers floatX* qkvr; // (L, B, T, 3*C) // in inference mode, this buffer will store the logits @@ -1796,8 +1794,9 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[15] = B * T * C; // lnf act_sizes[16] = B * T; // lnf_mean act_sizes[17] = B * T; // lnf_rstd - act_sizes[18] = L * B * T * 3*C; // qkvr - act_sizes[19] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch + act_sizes[18] = B * T; // losses + act_sizes[19] = L * B * T * 3*C; // qkvr + act_sizes[20] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch } // Backward pass is conceptually quite different from forward, because we can discard @@ -1848,7 +1847,7 @@ void* malloc_and_point_activations(ActivationTensors* acts, const size_t* act_si &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->atty, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->qkvr, &acts->output + &acts->lnf_mean, &acts->lnf_rstd, &acts->losses, &acts->qkvr, &acts->output }; return malloc_and_point(ptrs, act_sizes, NUM_ACTIVATION_TENSORS); } @@ -1980,7 +1979,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->use_master_weights = 1; // keep master weights copy in float for optim update? } -void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bool get_loss=true, int grad_accum_steps=1) { +void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { NVTX_RANGE_FN(); // targets are optional and could be NULL // in this function we must be careful and use size_t instead of int, otherwise @@ -2038,7 +2037,6 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo // todo - inputs is copied on default stream so this synchronises CPU/GPU for now cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice)); if (targets != NULL) { - // memcpy targets in parallel then wait for them before fused_classifier cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); } @@ -2124,20 +2122,17 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, bo NvtxRange classifier_and_loss_range("classifier_and_loss"); // fused classifier: does the forward pass and first part of the backward pass const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements - fused_classifier(acts.output, model->cpu_losses, dloss, model->targets, B, T, V, Vp); - // reset mean_loss here so gpt2_backward() knows we have targets - model->mean_loss = 0.0f; + fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp); + // for convenience also evaluate the mean loss (TODO re-think this compute+sync point) + cudaCheck(cudaMemcpy(model->cpu_losses, acts.losses, B * T * sizeof(floatX), cudaMemcpyDeviceToHost)); + float mean_loss = 0.0f; + for (int i = 0; i < B*T; i++) { mean_loss += (float)(model->cpu_losses[i]); } + mean_loss /= B*T*grad_accum_steps; + model->mean_loss = mean_loss; } else { // if we don't have targets, we don't have loss model->mean_loss = -1.0f; } - - // accumulate the loss immediately if we are 
not going to run gpt2_backward(), e.g. inference - if (get_loss) { - assert(targets != NULL); // makes no sense to request loss if we don't have targets - for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T*grad_accum_steps; - } } void gpt2_zero_grad(GPT2 *model) { @@ -2280,10 +2275,6 @@ void gpt2_backward(GPT2 *model) { layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); - - // accumulate the loss, this was calculated at the end of gpt2_forward() - for (int i=0; imean_loss += (float)(model->cpu_losses[i]); } - model->mean_loss /= B*T; } // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. @@ -2303,7 +2294,7 @@ float multi_gpu_cpu_float_mean(float value, const MultiGpuConfig* multi_gpu_conf void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU NVTX_RANGE_FN(); - if (multi_gpu_config->num_processes == 1) return; + if (multi_gpu_config->num_processes == 1) { return; } // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); // Average all gradients. @@ -2311,7 +2302,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { model->num_parameters, ncclFloatX, ncclAvg, multi_gpu_config->nccl_comm, - main_stream)); + 0)); #endif } @@ -2330,7 +2321,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo if (model->use_master_weights == 1) { // allocate one more buffer to keep the master copy of weights as float, and copy the weights over cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512, 0, main_stream>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); + copy_and_cast_kernel<<num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); cudaCheck(cudaGetLastError()); printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); } @@ -2341,7 +2332,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, + adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, (floatX*)model->grads_memory, model->m_memory, model->v_memory, model->num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); @@ -2363,7 +2354,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); if (model->use_master_weights == 1) { cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); - copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); + copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } @@ -2374,7 +2365,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float 
learning_rate, float beta1, float float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, + adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); cudaCheck(cudaGetLastError()); @@ -2383,8 +2374,7 @@ void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) { #ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) return; - + if (multi_gpu_config->num_processes == 1) { return; } // 1 process => noop if (multi_gpu_config->zero_stage == 1) { // gather updated shards of model->params_memory from each process ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, @@ -2417,11 +2407,8 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name); } - cudaCheck(cudaStreamCreate(&main_stream)); - // set up cuBLAS and cuBLASLt (and cuDNN if enabled) cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasSetStream(cublas_handle, main_stream)); cublasCheck(cublasLtCreate(&cublaslt_handle)); cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); @@ -2436,7 +2423,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru } void common_free(GPT2 &model) { - cudaCheck(cudaStreamDestroy(main_stream)); gpt2_free(&model); cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); @@ -2785,7 +2771,7 @@ int main(int argc, char *argv[]) { // we re-calculate the forward pass for all of (B,T) positions from scratch // but the inference here is just for sanity checking anyway // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(&model, gen_tokens, NULL, B, T, false); + gpt2_forward(&model, gen_tokens, NULL, B, T); // furthermore, below we're only using b=0 (i.e. the first row) of all B rows // we're in principle running B "inference streams" in parallel here // only using position 0 because it's a bit faster (copy less probs from GPU -> CPU) @@ -2833,7 +2819,7 @@ int main(int argc, char *argv[]) { dataloader_next_batch(&train_loader); } // forward pass. note that we pass in grad_accum_steps, which scales down the loss - gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, true, grad_accum_steps); + gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward // backward pass. 
all model params accumulate gradients with += inside this inner loop gpt2_backward(&model); From 2ccdfb70e0656014e30ab2843d4c539259f809a4 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 10:44:12 +0300 Subject: [PATCH 080/172] general cleanup --- dev/cuda/matmul_backward_bias.cu | 66 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 0bf5e44dd..820741b0f 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -324,45 +324,50 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout // kernel launcher // version1: simple cuBLAS calls -void matmul_backward_bias1(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias1(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); matmul_backward_bias_kernel1<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias2(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias2(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { // block_size 512 seems best const int grid_size = ceil_div(OC * 32, block_size); matmul_backward_bias_kernel2<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias3(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias3(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { // block_size 256 seems best matmul_backward_bias_kernel3<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } -void matmul_backward_bias4(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias4(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { assert(OC % 32 == 0); // OC must be divisible by 32 for this kernel const int grid_size = OC / 32; matmul_backward_bias_kernel4<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } #ifndef ENABLE_BF16 -void matmul_backward_bias5(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias5(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { const int grid_size_x = ceil_div(OC, block_size); const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / block_size); matmul_backward_bias_kernel5<<>>(dbias, dout, B, T, OC); + cudaCheck(cudaGetLastError()); } #endif -void matmul_backward_bias7(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias7(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { if(block_size < 256) { block_size = 256; } @@ -381,14 +386,16 @@ void matmul_backward_bias7(floatX* dbias, floatX* dout, assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC, block_size); + dim3(block_size_x, block_size_y), OC_per_warp * sizeof(float)>>>(dbias_buffer, dout, B, T, OC, block_size); + cudaCheck(cudaGetLastError()); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + 
cudaCheck(cudaGetLastError()); } -void matmul_backward_bias8(floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { +void matmul_backward_bias8(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { dim3 block_dim = {4, 8, (unsigned)block_size/32}; const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 @@ -398,41 +405,44 @@ void matmul_backward_bias8(floatX* dbias, floatX* dout, // and write results directly to the output. if(grid_size_y == 1) { matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); } else { - cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float)); + cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + cudaCheck(cudaGetLastError()); } } void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, - int B, int T, int C, int OC, int block_size) { + int B, int T, int OC, int block_size) { switch (kernel_num) { case 1: - matmul_backward_bias1(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias1(dbias, dout, B, T, OC, block_size); break; case 2: - matmul_backward_bias2(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias2(dbias, dout, B, T, OC, block_size); break; case 3: - matmul_backward_bias3(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias3(dbias, dout, B, T, OC, block_size); break; case 4: - matmul_backward_bias4(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias4(dbias, dout, B, T, OC, block_size); break; case 5: #ifndef ENABLE_BF16 - matmul_backward_bias5(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias5(dbias, dout, B, T, OC, block_size); #else fprintf(stderr, "Kernel 5 is only supported for fp32"); exit(1); #endif break; case 7: - matmul_backward_bias7(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias7(dbias, dout, B, T, OC, block_size); break; case 8: - matmul_backward_bias8(dbias, dout, B, T, C, OC, block_size); + matmul_backward_bias8(dbias, dout, B, T, OC, block_size); break; default: printf("Invalid kernel number\n"); @@ -466,7 +476,7 @@ int main(int argc, char **argv) { floatX* d_dout; cudaCheck(cudaMalloc(&d_dbias, OC * sizeof(floatX))); cudaCheck(cudaMalloc(&d_dout, B * T * OC * sizeof(floatX))); - cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float))); + cudaCheck(cudaMalloc(&dbias_buffer, OC * sizeof(float) * 32)); cudaCheck(memcpy_convert(d_dbias, dbias, OC)); cudaCheck(memcpy_convert(d_dout, dout, B * T * OC)); @@ -489,7 +499,7 @@ int main(int argc, char **argv) { // memset the bias to zero cudaCheck(cudaMemset(d_dbias, 0, OC * sizeof(floatX))); // calculate the GPU version - matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, C, OC, block_size); + matmul_backward_bias(kernel_num, d_dbias, d_dout, B, T, OC, block_size); // compare printf("Checking correctness...\n"); float tol = std::is_same_v ? 
5e-3f : 1.0f; @@ -502,7 +512,7 @@ int main(int argc, char **argv) { int block_size = block_sizes[j]; int repeat_times = 2000; float elapsed_time = benchmark_kernel(repeat_times, matmul_backward_bias, kernel_num, - d_dbias, d_dout, B, T, C, OC, block_size); + d_dbias, d_dout, B, T, OC, block_size); printf("block_size %d time %.4f ms\n", block_size, elapsed_time); } From 858c6e6dae447470e716bd1f0f47931980ebb4f6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Tue, 14 May 2024 18:40:57 +0300 Subject: [PATCH 081/172] deterministic kernel --- dev/cuda/matmul_backward_bias.cu | 120 +++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 820741b0f..12b167083 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -320,6 +320,101 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout } } +// Like kernel 8, but instead of accumulating to the auxiliary buffer, it writes +// multiple values that need to be summed up in a separate kernel call. +// If UseAuxBuffer is false, gridDim.y has to be one, and results are added directly +// to dbias. +template +__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { + constexpr const int bdx = 4; + constexpr const int bdy = 32 / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); + + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 64 at BF16 + + int local_oc = warp_c * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + + if(global_oc < OC) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + } + + __shared__ float sub_results[x128::size][32][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } + __syncthreads(); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + // coalesced, but not cacheline-sized + if constexpr (!UseAuxBuffer) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); + } else { + dbias[global_oc + k + blockIdx.y * OC] = a; + } + } + } +} + + +__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { + const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; + assert(n % x128::size == 0); + if (idx < n) { + f128 acc; + for(int k = 0; k < f128::size; ++k) { + acc[k] = 0.f; + } + + for(int l = 0; l < m; ++l) { + f128 s = load128(src + idx + n * l); + for(int k = 0; k < f128::size; ++k) { + acc[k] += s[k]; 
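+ // (the m per-slice partial sums are accumulated in a fixed order here, which keeps the reduction deterministic, unlike an atomicAdd-based version)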
+ } + } + for(int k = 0; k < f128::size; ++k) { + dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); + } + } +} + + // ---------------------------------------------------------------------------- // kernel launcher @@ -415,6 +510,28 @@ void matmul_backward_bias8(floatX* dbias, const floatX* dout, } } + +void matmul_backward_bias9(floatX* dbias, const floatX* dout, + int B, int T, int OC, int block_size) { + dim3 block_dim = {4, 8, (unsigned)block_size/32}; + const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = ceil_div(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, cuda_threads_per_SM * cuda_num_SMs / (block_size * grid_size_x)); // full GPU! + + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. + if(grid_size_y == 1) { + matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + } else { + // kernel 9 overwrites temp buffer, so no need to memset + matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); + cudaCheck(cudaGetLastError()); + } +} + void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, int B, int T, int OC, int block_size) { switch (kernel_num) { @@ -444,6 +561,9 @@ void matmul_backward_bias(int kernel_num, floatX* dbias, floatX* dout, case 8: matmul_backward_bias8(dbias, dout, B, T, OC, block_size); break; + case 9: + matmul_backward_bias9(dbias, dout, B, T, OC, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); From 7b810c1a3bf821d65bebd344b76c7c518392bd2a Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 00:20:52 +0300 Subject: [PATCH 082/172] update main training script --- train_gpt2.cu | 55 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..07d6b0018 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -896,13 +896,9 @@ __global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floa store128(dinp + idx, packed_dinp); } -// templated because if we have enough channels, we can write directly to the bf16 dbias buffer, and otherwise -// we need to write to a fp32 temp buffer. The `Atomic` argument indicates whether we add atomically. We cannot -// (easily) use a regular runtime `if(blockDim.y == 1)` runtime condition, because that doesn't compile for older -// GPUs. 
-template -__global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout, int B, int T, int OC, - std::bool_constant) { +template +__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { constexpr const int bdx = 4; constexpr const int bdy = 32 / bdx; assert(blockDim.x == bdx); @@ -957,19 +953,37 @@ __global__ void matmul_backward_bias_kernel8(OutFloat* dbias, const floatX* dout v += __shfl_down_sync(0xffffffff, v, 2, 4); a += v; } - - // coalesced, but not cacheline-sized writes if(warp_d == 0 && global_oc < OC) { - // if we have only one block per result, no need for atomics - if constexpr (!Atomic) { + if constexpr (!UseAuxBuffer) { dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); } else { - atomicAdd(dbias + global_oc + k, a); + dbias[global_oc + k + blockIdx.y * OC] = a; } } } } +__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { + const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; + assert(n % x128::size == 0); + if (idx < n) { + f128 acc; + for(int k = 0; k < f128::size; ++k) { + acc[k] = 0.f; + } + + for(int l = 0; l < m; ++l) { + f128 s = load128(src + idx + n * l); + for(int k = 0; k < f128::size; ++k) { + acc[k] += s[k]; + } + } + for(int k = 0; k < f128::size; ++k) { + dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); + } + } +} + __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, @@ -1308,12 +1322,6 @@ __global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { } } -__global__ void cast_and_add_kernel(floatX* dst, const float* src, size_t n) { - // used only for matmul_backward_bias kernel, a little bit embarassing TODO delete later - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { dst[idx] = (floatX)((float)dst[idx] + src[idx]); } // have to += because dbias is a paramater -} - // ---------------------------------------------------------------------------- // kernel launchers @@ -1548,11 +1556,14 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation // and write results directly to the output. 
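// otherwise, each y-slice of the grid writes its partial sums into dbias_buffer and reduce_add_sum_kernel folds them into dbias in a fixed (deterministic) order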
if(grid_size_y == 1) { - matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); } else { - cudaMemset(dbias_buffer, 0, OC * sizeof(float)); - matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); - cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); + // kernel 9 overwrites temp buffer, so no need to memset + matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); + cudaCheck(cudaGetLastError()); + reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); + cudaCheck(cudaGetLastError()); } } From d48c3a494542d496601c2ca683a1802097f75ec7 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 12:39:28 +0300 Subject: [PATCH 083/172] (optionally) recompute gelu activations to reduce activation memory --- train_gpt2.cu | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..608034add 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1766,7 +1766,7 @@ typedef struct { floatX* output; } ActivationTensors; -void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config) { +void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, bool recompute) { size_t Vp = config.padded_vocab_size; size_t L = config.num_layers; size_t NH = config.num_heads; @@ -1788,7 +1788,14 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[9] = L * B * T; // ln2_mean act_sizes[10] = L * B * T; // ln2_rstd act_sizes[11] = L * B * T * 4*C; // fch - act_sizes[12] = L * B * T * 4*C; // fch_gelu + // fch_gelu; result of a pointwise op, we may want to recompute to save activation memory + if (recompute) { + // if we recompute gelus, we just use the scratch buffer here + act_sizes[12] = B * T * 4*C; + } else { + act_sizes[12] = L * B * T * 4*C; + } + act_sizes[13] = L * B * T * C; // fcproj act_sizes[14] = L * B * T * C; // residual3 act_sizes[15] = B * T * C; // lnf @@ -1897,6 +1904,7 @@ typedef struct { floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; + int recompute_activations; } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -1977,6 +1985,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->mean_loss = -1.0f; // -1.0f will designate no loss model->rng_state = 13371337; model->use_master_weights = 1; // keep master weights copy in float for optim update? 
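+ // if nonzero, gelu activations are recomputed during the backward pass instead of keeping a per-layer fch_gelu buffer (see fill_in_activation_sizes)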
+ model->recompute_activations = 0; } void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { @@ -2012,7 +2021,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in model->batch_size = B; model->seq_len = T; // allocate the space - fill_in_activation_sizes(model->act_sizes, B, T, model->config); + fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute_activations); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -2075,7 +2084,12 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + floatX* l_fch_gelu; + if(model->recompute_activations) { + l_fch_gelu = acts.fch_gelu; // reuse the same buffer for every layer + } else { + l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + } floatX* l_fcproj = acts.fcproj + l * B * T * C; floatX* l_residual3 = acts.residual3 + l * B * T * C; @@ -2249,6 +2263,10 @@ void gpt2_backward(GPT2 *model) { floatX* dl_bt4c = (floatX*)grads_acts.bt4c; // backprop this layer + if(model->recompute_activations) { + l_fch_gelu = acts.fch_gelu; + gelu_forward(l_fch_gelu, l_fch, B*T*4*C); + } matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C); gelu_backward(dl_bt4c, l_fch, dl_bt4c, B*T*4*C); matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C); @@ -2588,6 +2606,7 @@ void error_usage() { fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); + fprintf(stderr, " -r Recompute some activations to save memory\n"); exit(EXIT_FAILURE); } @@ -2612,6 +2631,7 @@ int main(int argc, char *argv[]) { int max_steps = -1; int override_enable_tf32 = 1; int use_master_weights = 1; + int recompute_activations = 0; int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag @@ -2634,6 +2654,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } + else if (argv[i][1] == 'r') { recompute_activations = atoi(argv[i+1]); } else { error_usage(); } } // calculate a sensible default for total batch size by assuming no gradient accumulation @@ -2654,6 +2675,7 @@ int main(int argc, char *argv[]) { printf0("| genT | %-50d |\n", genT); printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); + printf0("| recompute_activations | %-50s |\n", recompute_activations ? 
"enabled" : "disabled"); printf0("+-----------------------+----------------------------------------------------+\n"); common_start(override_enable_tf32, false); // common init code for train/test/profile @@ -2670,6 +2692,7 @@ int main(int argc, char *argv[]) { GPT2 model; gpt2_build_from_checkpoint(&model, load_filename); model.use_master_weights = use_master_weights; + model.recompute_activations = recompute_activations; printf0("| load_filename | %-50s |\n", load_filename); printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); From 57f70ea66b4dc2859c9dea7ce677f68297d252e4 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 11:17:55 +0300 Subject: [PATCH 084/172] simplify multi-gpu logic by reducing #ifdefs --- test_gpt2.cu | 5 +++- train_gpt2.cu | 79 +++++++++++++-------------------------------------- 2 files changed, 24 insertions(+), 60 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 654e35db1..631357476 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -83,6 +83,7 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { + multi_gpu_config = multi_gpu_config_init(&argc, &argv); common_start(false, true); // set the right paths @@ -119,6 +120,8 @@ int main(int argc, char *argv[]) { printf("batch_size: %d\n", B); printf("seq_len: %d\n", T); + set_zero_configs(&multi_gpu_config, 0, model.num_parameters); + // read reference information from the file saved from Python/PyTorch side // 1) input x and y int* x = (int*)mallocCheck(B * T * sizeof(int)); @@ -263,7 +266,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); diff --git a/train_gpt2.cu b/train_gpt2.cu index b53f7712c..a8fe6a995 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -458,28 +458,26 @@ void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t t multi_gpu_config->shard_num_parameters = total_parameters; multi_gpu_config->shard_offset = 0; -#ifdef MULTI_GPU - // Check the Zero Stage and define sharding parameters - if (zero_stage == 0) { - printf0("| Zero Optimization is disabled |\n"); - } - else if (zero_stage == 1) { - if (total_parameters % multi_gpu_config->num_processes != 0) { - printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); - multi_gpu_config->zero_stage = 0; - } - else { - printf0("| Zero Stage1 is enabled |\n"); - multi_gpu_config->zero_stage = 1; - multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; - multi_gpu_config->shard_offset = multi_gpu_config->process_rank * (total_parameters / multi_gpu_config->num_processes); - } - } - else{ - printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % multi_gpu_config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); multi_gpu_config->zero_stage = 0; } -#endif + else { + printf0("| 
Zero Stage1 is enabled |\n"); + multi_gpu_config->zero_stage = 1; + multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; + multi_gpu_config->shard_offset = multi_gpu_config->process_rank * multi_gpu_config->shard_num_parameters; + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + multi_gpu_config->zero_stage = 0; + } } // ---------------------------------------------------------------------------- @@ -2306,40 +2304,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t) { - NVTX_RANGE_FN(); - // reference: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html - - // lazily allocate the memory for m_memory and v_memory - if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, model->num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20); - if (model->use_master_weights == 1) { - // allocate one more buffer to keep the master copy of weights as float, and copy the weights over - cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float))); - copy_and_cast_kernel<<num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters); - cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20); - } - } - - int block_size = 512; - int num_blocks = CEIL_DIV(model->num_parameters, block_size); - float beta1_correction = 1.0f - powf(beta1, t); - float beta2_correction = 1.0f - powf(beta2, t); - unsigned int seed = random_u32(&model->rng_state); - adamw_kernel3<<>>((floatX*)model->params_memory, model->master_weights, - (floatX*)model->grads_memory, model->m_memory, model->v_memory, - model->num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); - cudaCheck(cudaGetLastError()); -} - -void gpt2_multi_gpu_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2828,13 +2793,9 @@ int main(int argc, char *argv[]) { // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced model.mean_loss = lossf; // update the parameters -#ifndef MULTI_GPU - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1); -#else gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_multi_gpu_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, 
step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); -#endif // zero out the gradients for the next iteration gpt2_zero_grad(&model); cudaEventRecord(end); From 8b57cf65355c453d394231bfca478ef2d270bda5 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 14:12:29 +0300 Subject: [PATCH 085/172] reduce communication overhead for ZERO stage 1 --- train_gpt2.cu | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index a8fe6a995..04095e4d9 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2295,12 +2295,20 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { if (multi_gpu_config->num_processes == 1) { return; } // Average all losses. model->accumulated_mean_loss = multi_gpu_cpu_float_mean(model->mean_loss, multi_gpu_config); - // Average all gradients. - ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, - model->num_parameters, - ncclFloatX, ncclAvg, - multi_gpu_config->nccl_comm, - 0)); + if(multi_gpu_config->zero_stage == 0) { + // no ZERO == standard DDP: Average all gradients. + ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, + model->num_parameters, + ncclFloatX, ncclAvg, + multi_gpu_config->nccl_comm, 0)); + } else if (multi_gpu_config->zero_stage == 1) { + // ZERO-1: Get average gradient for local shard + floatX* local_grads_memory = (floatX*) model->grads_memory + multi_gpu_config->shard_offset; + ncclCheck(ncclReduceScatter(model->grads_memory, local_grads_memory, + multi_gpu_config->shard_num_parameters, + ncclFloatX, ncclAvg, + multi_gpu_config->nccl_comm, 0)); + } #endif } From fbd8f03eead34791550b4aaf8add5e2775e0db75 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Thu, 16 May 2024 14:25:06 +0300 Subject: [PATCH 086/172] fixup profiling --- profile_gpt2.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index c29cd6a08..4b24c8973 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -27,7 +27,8 @@ the profile.ncu-rep from a cloud box to local to pretty view. #define TESTING #include "train_gpt2.cu" -int main() { +int main(int argc, char *argv[]) { + multi_gpu_config = multi_gpu_config_init(&argc, &argv); common_start(true, true); // build the GPT-2 model from a checkpoint @@ -53,7 +54,7 @@ int main() { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free From d7581fc5428b02d71b954ba9f9f073627c4d0a83 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 16 May 2024 19:07:39 +0000 Subject: [PATCH 087/172] make recompute be an int instead of bool, so we can strengthen it over time just like ZeRO stages, as we recompute more and more of the model in the future possibly. 
and make it default on because it is awesome --- train_gpt2.cu | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 608034add..a6dcb7d5b 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1766,7 +1766,7 @@ typedef struct { floatX* output; } ActivationTensors; -void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, bool recompute) { +void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, int recompute) { size_t Vp = config.padded_vocab_size; size_t L = config.num_layers; size_t NH = config.num_heads; @@ -1788,14 +1788,8 @@ void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config act_sizes[9] = L * B * T; // ln2_mean act_sizes[10] = L * B * T; // ln2_rstd act_sizes[11] = L * B * T * 4*C; // fch - // fch_gelu; result of a pointwise op, we may want to recompute to save activation memory - if (recompute) { - // if we recompute gelus, we just use the scratch buffer here - act_sizes[12] = B * T * 4*C; - } else { - act_sizes[12] = L * B * T * 4*C; - } - + // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer + act_sizes[12] = (recompute == 0) ? L * B * T * 4*C : B * T * 4*C; act_sizes[13] = L * B * T * C; // fcproj act_sizes[14] = L * B * T * C; // residual3 act_sizes[15] = B * T * C; // lnf @@ -1904,7 +1898,7 @@ typedef struct { floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; - int recompute_activations; + int recompute; } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -1985,7 +1979,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->mean_loss = -1.0f; // -1.0f will designate no loss model->rng_state = 13371337; model->use_master_weights = 1; // keep master weights copy in float for optim update? - model->recompute_activations = 0; + model->recompute = 1; // default to recompute gelu during backward } void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { @@ -2021,7 +2015,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in model->batch_size = B; model->seq_len = T; // allocate the space - fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute_activations); + fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -2084,12 +2078,9 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu; - if(model->recompute_activations) { - l_fch_gelu = acts.fch_gelu; // reuse the same buffer for every layer - } else { - l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; - } + // reuse the same activation buffer at each layer, as we'll re-compute the gelu during backward + // very useful because we dramatically reduce VRAM usage, and may be able to fit larger batch size + floatX* l_fch_gelu = (model->recompute == 0) ? 
acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; floatX* l_fcproj = acts.fcproj + l * B * T * C; floatX* l_residual3 = acts.residual3 + l * B * T * C; @@ -2252,7 +2243,7 @@ void gpt2_backward(GPT2 *model) { floatX* l_ln2_mean = acts.ln2_mean + l * B * T; floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C; + floatX* l_fch_gelu = (model->recompute == 0) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; // get the pointers of the gradients of the activations for this layer // notice that there is no l *, because we just have a single copy, and keep // re-using this memory in every Transformer block as we calculate backward pass @@ -2262,9 +2253,10 @@ void gpt2_backward(GPT2 *model) { floatX* dl_btc = (floatX*)acts.lnf; floatX* dl_bt4c = (floatX*)grads_acts.bt4c; - // backprop this layer - if(model->recompute_activations) { - l_fch_gelu = acts.fch_gelu; + // start the backward pass for this layer + if(model->recompute >= 1) { + // recompute >= 1 means we recompute gelu. in this case, + // l_fch_gelu is just a buffer, so re-compute the gelu from l_fch here gelu_forward(l_fch_gelu, l_fch, B*T*4*C); } matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C); @@ -2606,7 +2598,7 @@ void error_usage() { fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); - fprintf(stderr, " -r Recompute some activations to save memory\n"); + fprintf(stderr, " -r recompute: saves memory at cost of speed. (default = 1), 0 = none. 1 = recompute gelu\n"); exit(EXIT_FAILURE); } @@ -2631,7 +2623,7 @@ int main(int argc, char *argv[]) { int max_steps = -1; int override_enable_tf32 = 1; int use_master_weights = 1; - int recompute_activations = 0; + int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag @@ -2654,7 +2646,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } - else if (argv[i][1] == 'r') { recompute_activations = atoi(argv[i+1]); } + else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } } // calculate a sensible default for total batch size by assuming no gradient accumulation @@ -2675,7 +2667,7 @@ int main(int argc, char *argv[]) { printf0("| genT | %-50d |\n", genT); printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); - printf0("| recompute_activations | %-50s |\n", recompute_activations ? 
"enabled" : "disabled"); + printf0("| recompute | %-50d |\n", recompute); printf0("+-----------------------+----------------------------------------------------+\n"); common_start(override_enable_tf32, false); // common init code for train/test/profile @@ -2692,7 +2684,7 @@ int main(int argc, char *argv[]) { GPT2 model; gpt2_build_from_checkpoint(&model, load_filename); model.use_master_weights = use_master_weights; - model.recompute_activations = recompute_activations; + model.recompute = recompute; printf0("| load_filename | %-50s |\n", load_filename); printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); From 3113656e3f1009264e1c2315947685d87e02f769 Mon Sep 17 00:00:00 2001 From: Anthony Blake Date: Thu, 16 May 2024 13:30:45 -0700 Subject: [PATCH 088/172] Add link to AMD fork --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 41cef25af..469326fbf 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,9 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## notable forks +- AMD support + - [llm.c](https://github.com/anthonix/llm.c) by @[anthonix](https://github.com/anthonix): support for AMD devices, such as the 7900 XTX + - C# - [llm.cs](https://github.com/azret/llm.cs) by @[azret](https://github.com/azret): a C# port of this project - [Llm.cs](https://github.com/nietras/Llm.cs) by @[nietras](https://github.com/nietras): a C# port of this project with focus on easy to get started on any platform. Clone and run ✅ From 0f23723ae4f5916cfcc3367a59b2fa5b12a018e5 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 00:01:38 +0300 Subject: [PATCH 089/172] joined optimizer state allocation --- train_gpt2.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 18f08990b..d49d27d6f 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2340,12 +2340,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; if (model->m_memory == NULL) { - cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); - printf0("allocated %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); - printf0("allocated %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + size_t alloc_bytes = 2 * num_parameters * sizeof(float); + cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); + model->v_memory = model->m_memory + num_parameters; + cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); + printf0("allocated %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); if (model->use_master_weights == 1) { cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); From 88c3bea890bbee54901ae571b4a887b91546f504 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 00:57:09 +0300 Subject: [PATCH 090/172] print message before actual allocation for more informative OOM behaviour --- train_gpt2.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index d49d27d6f..91fe1ca72 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2030,8 +2030,8 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in num_activations += model->act_sizes[i]; } model->num_activations = num_activations; + printf0("allocating %d MiB for activations\n", (int)round(num_activations * sizeof(floatX) / (1024 * 1024))); model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes); - printf0("allocated %d MiB for activations\n", (int)round(num_activations * sizeof(floatX) / (1024 * 1024))); // also create memory for caching inputs and targets cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int))); cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int))); @@ -2167,19 +2167,19 @@ void gpt2_backward(GPT2 *model) { // lazily allocate the memory for gradients of the weights and activations, if needed if (model->grads_memory == NULL) { // allocate buffers for weight gradients + printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof); - printf0("allocated %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); // we're going to be clever for the activations backward pass. we don't need to exactly // mirror the forward pass activations and we will save memory. size_t bw_act_sizes[NUM_ACTIVATION_TENSORS]; fill_in_grad_act_sizes(bw_act_sizes, model->batch_size, model->seq_len, model->config); // count up and allocate the space - model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); model->num_grad_acts = 0; for (size_t i = 0; i < NUM_BACKWARD_TENSORS; i++) { model->num_grad_acts += bw_act_sizes[i]; } - printf0("allocated %d MiB for activation gradients\n", (int)round(model->num_grad_acts * sizeof(floatX) / (1024 * 1024))); + printf0("allocating %d MiB for activation gradients\n", (int)round(model->num_grad_acts * sizeof(floatX) / (1024 * 1024))); + model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); // init gradients of parameters and activations to zero gpt2_zero_grad(model); } @@ -2341,15 +2341,15 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo if (model->m_memory == NULL) { size_t alloc_bytes = 2 * num_parameters * sizeof(float); + printf0("allocating %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); model->v_memory = model->m_memory + num_parameters; cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); - printf0("allocated %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); if (model->use_master_weights == 1) { + printf0("allocating %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); copy_and_cast_kernel<<>>(model->master_weights, params_memory, num_parameters); cudaCheck(cudaGetLastError()); - printf0("allocated %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); } } From b24279c4db2beb9bb962cdd7b87782db5df7dd32 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 17 May 2024 01:15:47 +0300 Subject: [PATCH 091/172] remove duplicate workspace allocation --- train_gpt2.cu | 2 -- 1 file changed, 2 
deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 91fe1ca72..afc1f7ba0 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2409,8 +2409,6 @@ void common_start(bool override_enable_tf32 = true, bool print_device_info = tru bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); cublas_compute = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - // setup the (global) cuBLASLt workspace - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); create_cudnn(); } From c8fa7a8c63a110f0b74746210c5dfc18717758e7 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 17 May 2024 15:13:36 +0000 Subject: [PATCH 092/172] revert the adamw allocation to previous. minor --- train_gpt2.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index afc1f7ba0..030c5c9b7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2340,11 +2340,12 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo floatX* grads_memory = (floatX*)model->grads_memory + multi_gpu_config->shard_offset; if (model->m_memory == NULL) { - size_t alloc_bytes = 2 * num_parameters * sizeof(float); - printf0("allocating %zu MiB for AdamW optimizer state\n", alloc_bytes >> 20); - cudaCheck(cudaMalloc((void**)&model->m_memory, alloc_bytes)); - model->v_memory = model->m_memory + num_parameters; - cudaCheck(cudaMemset(model->m_memory, 0, alloc_bytes)); + printf0("allocating %zu MiB for AdamW optimizer state m\n", (num_parameters * sizeof(float)) >> 20); + printf0("allocating %zu MiB for AdamW optimizer state v\n", (num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**)&model->m_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->m_memory, 0, num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, num_parameters * sizeof(float))); if (model->use_master_weights == 1) { printf0("allocating %zu MiB for master copy of params\n", (num_parameters * sizeof(float)) >> 20); cudaCheck(cudaMalloc((void**)&model->master_weights, num_parameters * sizeof(float))); From bf36a4b6b5be081bb675dca0224a61fcba542769 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 18 May 2024 13:27:35 +0300 Subject: [PATCH 093/172] improved numerical error checking: tighter tolarances relative tolerance based of bf16 epsilon less verbose output if all is OK --- test_gpt2.cu | 60 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 631357476..862a641e3 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -8,11 +8,15 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 int ok = 1; float max_diff = 0.0f; float max_rel_error = 0.0f; + float max_to_threshold = 0.f; float max_a = 0.0f; float max_b = 0.0f; - printf("%s\n", label); + float epsilon = 0.079; // BF16 epsilon value + printf("%8s: ", label); for (int i = 0; i < n; i++) { + float t_eff = threshold + fabs(b[i]) * epsilon; float diff = fabsf(a[i] - b[i]); + max_to_threshold = max(max_to_threshold, diff / t_eff); if (diff > max_diff) { max_diff = diff; float denom = fabsf(b[i]); @@ -20,21 +24,27 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 max_a = 
a[i]; max_b = b[i]; } - if (diff <= threshold) { - if (i < print_upto) { printf("OK "); } - } else { - if (i < print_upto) { printf("NOT OK "); } + if (diff > t_eff) { ok = 0; } - if (i < print_upto) { printf("%f %f\n", a[i], b[i]); } } // print the final result if (ok) { - printf("TENSOR OK, max diff: %e, with rel error: %e (calculated=%f, ref=%f)\n", - max_diff, max_rel_error, max_a, max_b); + printf("TENSOR OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", + max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); } else { - printf("TENSOR NOT OK, max diff: %e, with rel error: %e (calculated=%f, ref=%f)\n", - max_diff, max_rel_error, max_a, max_b); + printf("TENSOR NOT OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", + max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); + } + + if(ok == 0) { + for (int i = 0; i < print_upto; i++) { + float t_eff = threshold + fabs(b[i]) * epsilon; + float diff = fabsf(a[i] - b[i]); + printf(diff <= threshold ? "OK " : "NOT OK "); + printf("%f %f\n", a[i], b[i]); + } + printf("\n"); } return ok; } @@ -248,22 +258,24 @@ int main(int argc, char *argv[]) { // Also, if code changes and some of these get tripped, it could be ok if it's not by too much, // because our use of stochastic rounding is adding some non-determinism "pepper noise". // In that case it's ok to extend the tolerance by a bit, after a manual review. - allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 8e-1f); - allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 1e-2f); - allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1.4e-1); // hmm a bit high - allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 4e-2f); - allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 3e-2f); + // Also, different GPUs may use different matrix multiplication algorithms, so the + // actual errors can be hardware specific. 
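For reference, the per-element test above combines an absolute floor with a relative term scaled by the reference magnitude. A stand-alone restatement of just that check, as an editor's sketch rather than code from the patch (the function and parameter names are illustrative):

#include <math.h>

// Returns 1 if `a` matches the reference `b` within threshold + |b| * epsilon,
// where epsilon plays the role of the BF16-derived relative factor in check_tensor.
int within_tolerance(float a, float b, float threshold, float epsilon) {
    float t_eff = threshold + fabsf(b) * epsilon;  // effective per-element tolerance
    return fabsf(a - b) <= t_eff;                  // 1 = OK, 0 = NOT OK
}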
+ allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 4e-1f); // hmm a bit high + allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 4e-3f); + allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1e-1); // hmm a bit high + allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 3.5e-2f); + allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 2e-2f); allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", 3e-2f); - allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", 9e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", 3e-2f); - allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", 0.1f); // hmm bit higher - allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", 3e-2f); - allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", 0.1f); // hmm bit higher - allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", 3e-2f); + allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", 5e-2f); // hmm a bit high + allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", 1.5e-2f); + allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", 6e-4f); + allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", 9e-3f); + allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", 2e-3f); + allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", 2.5e-3f); allok = allok & check_tensor(tensors1[14], tensors2[14], C, "lnfw", 0.12f); // hmm bit higher - allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 3e-2f); + allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 2e-2f); } gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); From 4374360015c93143acc261d96e951f4be8e41330 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 18 May 2024 18:34:24 +0000 Subject: [PATCH 094/172] adjust wte upper bound a bit, and print always because this part is really tricky and i don't trust anything other than manual inspection, even if we pass, allegedly --- test_gpt2.cu | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 862a641e3..84701b039 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -12,7 +12,8 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 float max_a = 0.0f; float max_b = 0.0f; float epsilon = 0.079; // BF16 epsilon value - printf("%8s: ", label); + printf("---\n"); + printf("checking tensor: %s\n", label); for (int i = 0; i < n; i++) { float t_eff = threshold + fabs(b[i]) * epsilon; float diff = fabsf(a[i] - b[i]); @@ -27,6 +28,11 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 if (diff > t_eff) { ok = 0; } + // print the first few elements so we can visually assess the "proof" of the comparison + if (i < print_upto) { + printf(diff <= t_eff ? 
"OK " : "NOT OK "); + printf("%f %f\n", a[i], b[i]); + } } // print the final result if (ok) { @@ -36,16 +42,6 @@ int check_tensor(float *a, float *b, int n, const char* label, float threshold=1 printf("TENSOR NOT OK, max diff: %.3e, with rel error: %.3e (calculated=%10f, ref=%10f), %.2f%% of maximum error\n", max_diff, max_rel_error, max_a, max_b, max_to_threshold*100); } - - if(ok == 0) { - for (int i = 0; i < print_upto; i++) { - float t_eff = threshold + fabs(b[i]) * epsilon; - float diff = fabsf(a[i] - b[i]); - printf(diff <= threshold ? "OK " : "NOT OK "); - printf("%f %f\n", a[i], b[i]); - } - printf("\n"); - } return ok; } @@ -260,7 +256,7 @@ int main(int argc, char *argv[]) { // In that case it's ok to extend the tolerance by a bit, after a manual review. // Also, different GPUs may use different matrix multiplication algorithms, so the // actual errors can be hardware specific. - allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 4e-1f); // hmm a bit high + allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 6e-1f); // hmm a bit high allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 4e-3f); allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1e-1); // hmm a bit high allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 3.5e-2f); From 44d45bdd6a2d6ded079ea1fe762b61bb0889faba Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 1 May 2024 04:24:54 +0300 Subject: [PATCH 095/172] first draft for gradient clipping by global norm --- profile_gpt2cu.py | 2 ++ train_gpt2.cu | 83 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index d9dbd4f8e..4113d7819 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -50,6 +50,8 @@ # model config CLS_START = -1 CLS_NUM = 6 +NORM_ID = 44 +ADAM_ID = 45 N_LAYERS = 12 summaries = defaultdict(lambda: 0.0) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..cfec532c1 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,11 +1151,31 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float* grad_norm, float max_grad_norm, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard + + float scale = 1.f; + if(!isfinite(*grad_norm)) { + // if we had a numerical problem (e.g, overflow) + // in our gradient calculation, don't mess up the + // existing weights. 
+ // TODO increase a global counter somewhere so we actually know if/how often this happens + if(threadIdx.x == 0 && blockIdx.x == 0) { + printf("[WARNING] weight update skipped due to non-finite gradients!\n"); + } + return; + } + if(*grad_norm > max_grad_norm) { + scale = max_grad_norm / *grad_norm; + // TODO just for debugging, remove this + if(threadIdx.x == 0 && blockIdx.x == 0) { + printf("[scale %f]\n", scale); + } + } // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[idx]; + float grad = scale * (float)grads_memory[idx]; float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -1180,6 +1200,40 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg if (master_params_memory != NULL) { master_params_memory[idx] = param; } } +template +__global__ void norm_kernel(float* out, const T* data, size_t count) { + // we want as few atomics as possible, so each block tries to do + // the maximum amount of work (so no fixed chunk, but instead iterating + // until we run out of data), and then we reduce inside the block + // and finally have just one atomic per block. + // TODO write a second version that just spams atomics in dev/cuda, + // often they are surprisingly fast + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + __shared__ float block_result[32]; + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + block_result[warp.meta_group_rank()] = warp_result; + block.sync(); + if(warp.meta_group_rank() == 0) { + float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; + float block_sum = cg::reduce(warp, gather, cg::plus{}); + if(warp.thread_rank() == 0) { + atomicAdd(out, block_sum); + } + } +} + struct SoftmaxParams { float Scale; float Offset; @@ -1656,6 +1710,20 @@ void fused_classifier(Type* logits, Type* losses, cudaCheck(cudaGetLastError()); } +template +void norm(float* out, const T* values, size_t count) { + const int block_size = 512; + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. + const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + // ---------------------------------------------------------------------------- // GPT-2 model definition @@ -2354,14 +2422,25 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } + // repurposing this buffer. We calculate the gradient norm on the GPU, and need it in the next kernel, + // so we _really_ don't want to transfer it here as an actual float. 
So we just pass around a pointer + // to this memory that is not otherwise needed during the update phase. + float* grad_norm = (float*)model->acts.output; + + // global gradient norm + norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); + float max_grad_norm = 1.f; // TODO figure out a good value unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed); + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_norm, max_grad_norm, + seed); cudaCheck(cudaGetLastError()); } From d7a81ef26fdd2b67f56d922612e9942927ee2ebd Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 1 May 2024 13:35:54 +0300 Subject: [PATCH 096/172] added a useful mixed precision utility for dev/cuda --- dev/cuda/Makefile | 3 +- dev/cuda/global_norm.cu | 199 ++++++++++++++++++++++++++++++++++++++++ train_gpt2.cu | 8 +- 3 files changed, 204 insertions(+), 6 deletions(-) create mode 100644 dev/cuda/global_norm.cu diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index c74178851..14eae201e 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -18,7 +18,7 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux- $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ # Build all targets -TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward +TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm all: $(TARGETS) # Individual targets: forward pass @@ -48,6 +48,7 @@ matmul_backward: matmul_backward.cu # Update kernels adamw: adamw.cu +global_norm: global_norm.cu # NCCL communication kernels nccl_all_reduce: nccl_all_reduce.cu diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu new file mode 100644 index 000000000..434c343f2 --- /dev/null +++ b/dev/cuda/global_norm.cu @@ -0,0 +1,199 @@ +/* +Kernels for a global norm. +Global norm in this context means that we want to calculate a single norm cooperatively using all avalailable SMs, instead + of multiple norms that can be handled by separate blocks. 
+ +Compile example: +nvcc -O3 --use_fast_math global_norm.cu -o global_norm + +version 1 uses as few blocks as possible to still fill the GPU, and only does atomic adds in the end +./gelu_forward 1 + +version 2 is the same but with only warp-wide reduction inside the kernel, and more global atomics +./gelu_forward 2 +*/ + +#include "common.h" +#include +#include +#include + +// TODO move this into common.h +// turn on bf16 as default, done up here for now +#define ENABLE_BF16 + +#if defined(ENABLE_BF16) +typedef __nv_bfloat16 floatX; +typedef __nv_bfloat16 floatN; +#elif defined(ENABLE_FP16) +typedef half floatX; +typedef half floatN; +#else +typedef float floatX; +typedef float floatN; +#endif + +typedef Packed128 x128; + +float global_norm_cpu(const float* data, size_t count) { + // accumulate in double so we have an accurate numerical reference + double acc = 0.0; + for(size_t i = 0; i < count; ++i) { + acc += (double)data[i] * (double)data[i]; + } + return (float)acc; +} + + +template +__global__ void norm_kernel1(float* out, const T* data, size_t count) { + // we want as few atomics as possible, so each block tries to do + // the maximum amount of work (so no fixed chunk, but instead iterating + // until we run out of data), and then we reduce inside the block + // and finally have just one atomic per block. + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + __shared__ float block_result[32]; + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + block_result[warp.meta_group_rank()] = warp_result; + block.sync(); + if(warp.meta_group_rank() == 0) { + float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; + float block_sum = cg::reduce(warp, gather, cg::plus{}); + if(warp.thread_rank() == 0) { + atomicAdd(out, block_sum); + } + } +} + + + +template +__global__ void norm_kernel2(float* out, const T* data, size_t count) { + // no shared memory; but one atomic per warp instead of per block + namespace cg = cooperative_groups; + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + // out will be updated atomically from all thread blocks + size_t index = threadIdx.x + blockDim.x * blockIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + + // warp-level reduce + float warp_result = cg::reduce(warp, accumulator, cg::plus{}); + // and atomic in global buffer + if(warp.thread_rank() == 0) { + atomicAdd(out, warp_result); + } +} + + + +template +void global_norm1(float* out, const T* values, size_t count, int block_size) { + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. 
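// [Editor's aside, not part of the original file] A concrete instance of the sizing
// described above: on an A100-class GPU (108 SMs, up to 2048 resident threads per SM),
// block_size = 512 gives grid_size = 2048 * 108 / 512 = 432 blocks, i.e. exactly one
// full wave of the GPU; one extra block would have to run as a second, nearly idle wave.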
+ const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel1<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + +template +void global_norm2(float* out, const T* values, size_t count, int block_size) { + // ditto + const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel2<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + +void global_norm(int kernel_num, float* out, const floatX* values, size_t count, int block_size) { + switch (kernel_num) { + case 1: + return global_norm1(out, values, count, block_size); + case 2: + return global_norm2(out, values, count, block_size); + } +} + +int main(int argc, const char **argv) { + setup_main(); + + int C = 768; + int L = 12; + + size_t num_params = (size_t)(C * 4*C + C*C) * 2 * L; + + // create host memory of random numbers + float* inp = make_random_float(num_params); + // scale them down + for(size_t i = 0; i < num_params; ++i) { + inp[i] *= 1e-3; + } + + // read kernel_num from command line + int kernel_num = 1; + if (argc > 1) { + kernel_num = atoi(argv[1]); + } + printf("Using kernel %d\n", kernel_num); + + // first check the correctness of the kernel + float out = global_norm_cpu(inp, num_params); + + // move to GPU + float* d_out; + floatX* d_inp; + cudaCheck(cudaMalloc(&d_out, sizeof(float))); + cudaCheck(cudaMalloc(&d_inp, num_params * sizeof(floatX))); + cudaCheck(memcpy_convert(d_inp, inp, num_params)); + + int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + printf("Checking block size %d.\n", block_size); + cudaCheck(cudaMemset(d_out, 0, sizeof(float))); + global_norm(kernel_num, d_out, d_inp, num_params, block_size); + validate_result(d_out, &out, "out", 1, 1e-2f); + } + + printf("All results match. Starting benchmarks.\n\n"); + + for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { + int block_size = block_sizes[j]; + + int repeat_times = 1000; + + float elapsed_time = benchmark_kernel(repeat_times, global_norm, + kernel_num, d_out, d_inp, + num_params, block_size); + size_t memory_ops = num_params * sizeof(floatX); + float memory_bandwidth = memory_ops / elapsed_time / 1e6; + + printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s\n", block_size, elapsed_time, memory_bandwidth); + } + + // free memory + free(inp); + cudaCheck(cudaFree(d_out)); + cudaCheck(cudaFree(d_inp)); +} \ No newline at end of file diff --git a/train_gpt2.cu b/train_gpt2.cu index cfec532c1..cb006285c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1206,8 +1206,6 @@ __global__ void norm_kernel(float* out, const T* data, size_t count) { // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block // and finally have just one atomic per block. 
- // TODO write a second version that just spams atomics in dev/cuda, - // often they are surprisingly fast namespace cg = cooperative_groups; cg::thread_block block = cg::this_thread_block(); cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); @@ -1711,7 +1709,7 @@ void fused_classifier(Type* logits, Type* losses, } template -void norm(float* out, const T* values, size_t count) { +void global_norm(float* out, const T* values, size_t count) { const int block_size = 512; // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. // having one block less than possible is a tiny performance hit, having @@ -1720,7 +1718,7 @@ void norm(float* out, const T* values, size_t count) { // on all gpus, so the division really is going to be exact. const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - norm_kernel<<>>(out, values, count); + norm_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } @@ -2428,7 +2426,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo float* grad_norm = (float*)model->acts.output; // global gradient norm - norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + global_norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); From a9947a8315101778de657b2e597c1a2ffc09939e Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 3 May 2024 00:39:06 +0300 Subject: [PATCH 097/172] added a flag and the missing sqrt; testing now has clipping enabled --- test_gpt2.cu | 22 +++++++++++----------- train_gpt2.cu | 14 ++++++++------ train_gpt2.py | 6 ++++-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/test_gpt2.cu b/test_gpt2.cu index 84701b039..50a291f18 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -274,7 +274,7 @@ int main(int argc, char *argv[]) { allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 2e-2f); } - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, step+1, &multi_gpu_config); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.01f, 1.f, step+1, &multi_gpu_config); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); @@ -283,16 +283,16 @@ int main(int argc, char *argv[]) { // expected losses are as follows, from Python float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.2700, + 4.0607, + 3.3166, + 2.7115, + 2.1702, + 1.6349, + 1.1419, + 0.7038, + 0.3769, + 0.1743 }; // compare diff --git a/train_gpt2.cu b/train_gpt2.cu index cb006285c..088afd69c 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1168,10 +1168,10 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg return; } if(*grad_norm > max_grad_norm) { - scale = max_grad_norm / *grad_norm; + scale = max_grad_norm / sqrtf(*grad_norm); // TODO just for debugging, remove this if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[scale %f]\n", scale); + printf("[norm %f]\n", sqrtf(*grad_norm)); } } // get the gradient, m, and v for this parameter @@ -2399,7 +2399,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float 
learning_rate, float beta1, float beta2, float eps, float weight_decay, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clipping, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2432,12 +2432,11 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); - float max_grad_norm = 1.f; // TODO figure out a good value unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, max_grad_norm, + grad_norm, grad_clipping, seed); cudaCheck(cudaGetLastError()); } @@ -2684,6 +2683,7 @@ int main(int argc, char *argv[]) { int use_master_weights = 1; int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training + float grad_clipping = 1.f; for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2704,6 +2704,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } + else if (argv[i][1] == 'c') { grad_clipping = atof(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } @@ -2719,6 +2720,7 @@ int main(int argc, char *argv[]) { printf0("| sequence length T | %-50d |\n", T); printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); + printf0("| grad_clipping | %-50e |\n", grad_clipping); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); printf0("| val_max_batches | %-50d |\n", val_max_batches); @@ -2903,7 +2905,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clipping, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); diff --git a/train_gpt2.py b/train_gpt2.py index c50fc7b61..b57fe432c 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -407,6 +407,7 @@ def print0(*args, **kwargs): parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") + parser.add_argument("--grad_clipping", type=float, default=1, help="maximum gradient magnitude") args = 
parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -552,6 +553,7 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] + norm = -1 # dummy value to print in inference-only mode for step in range(args.num_iterations): t0 = time.time() @@ -575,7 +577,7 @@ def get_batch(): # backward pass if not args.inference_only: loss.backward() - # todo: grad clip here + norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clipping) optimizer.step() optimizer.zero_grad(set_to_none=True) @@ -588,7 +590,7 @@ def get_batch(): t1 = time.time() # the 0th iteration is often an outlier (much slower) => skip logging it tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0) - print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}") + print0(f"iteration {step+1}, loss: {lossf:.4f}, time: {(t1-t0)*1000:.3f}ms, tok/s: {tokens_per_second:.2f}, norm: {norm:.3f}") if step > 0 and step > args.num_iterations - 20: timings.append(t1-t0) From c3a3b9daa526eb5cc02b05aaea7451ed3c4f0d53 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 3 May 2024 01:08:04 +0300 Subject: [PATCH 098/172] fixed profile target --- profile_gpt2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 4b24c8973..5a6764533 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); gpt2_backward(&model); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1, &multi_gpu_config); + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free From 589ead1e4ffd65db4f2ef2d5a8ea306bd06cd9e9 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Wed, 8 May 2024 00:13:09 +0200 Subject: [PATCH 099/172] updated code to adapt to latest changes --- profile_gpt2cu.py | 4 +--- train_gpt2.cu | 27 ++++++++------------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index 4113d7819..de2edfda9 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -50,8 +50,6 @@ # model config CLS_START = -1 CLS_NUM = 6 -NORM_ID = 44 -ADAM_ID = 45 N_LAYERS = 12 summaries = defaultdict(lambda: 0.0) @@ -132,7 +130,7 @@ # the classifier part, counts only once pass_name = "cls" phase = "bwd" - elif "adamw" in kernel: + elif "adamw" in kernel or "global_norm" in kernel: # encoder layer or adam pass_name = "opt" # before the first optimizer run, we create weight copies. diff --git a/train_gpt2.cu b/train_gpt2.cu index 088afd69c..05a0ba2fc 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1201,18 +1201,13 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg } template -__global__ void norm_kernel(float* out, const T* data, size_t count) { +__global__ void global_norm_kernel(float* out, const T* data, size_t count) { // we want as few atomics as possible, so each block tries to do // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block // and finally have just one atomic per block. 
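Taken together, patches 095 through 099 implement gradient clipping by global norm: the kernel above accumulates the sum of squared gradient elements into a single device float, and adamw_kernel3 converts it into a scale factor applied to each gradient as it is read. A plain CPU sketch of the underlying operation, as an editor's example rather than code from the patches (function and variable names are illustrative; in the patches the squared norm is what gets passed around, and the scale is applied inside the optimizer kernel instead of rewriting the gradient buffer):

#include <math.h>
#include <stddef.h>

// Clip `grads` so that its global L2 norm does not exceed max_norm (1.0f by default via -c).
void clip_grads_by_global_norm(float* grads, size_t n, float max_norm) {
    double sumsq = 0.0;                 // what the global_norm kernel accumulates (as a float, via atomics)
    for (size_t i = 0; i < n; i++) {
        sumsq += (double)grads[i] * (double)grads[i];
    }
    float norm = sqrtf((float)sumsq);   // the sqrtf that patch 097 adds on the kernel side
    if (norm > max_norm) {
        float scale = max_norm / norm;  // corresponds to `scale` in adamw_kernel3
        for (size_t i = 0; i < n; i++) {
            grads[i] *= scale;
        }
    }
}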
- namespace cg = cooperative_groups; - cg::thread_block block = cg::this_thread_block(); - cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); - - __shared__ float block_result[32]; - - // out will be updated atomically from all thread blocks + // out will be updated atomically from all thread blocks. It is a float, so the + // atomic op is unproblematic size_t index = threadIdx.x + blockDim.x * blockIdx.x; size_t grid_width = blockDim.x * gridDim.x; float accumulator = 0.f; @@ -1220,15 +1215,9 @@ __global__ void norm_kernel(float* out, const T* data, size_t count) { accumulator += (float)data[i] * (float)data[i]; } // warp-level reduce - float warp_result = cg::reduce(warp, accumulator, cg::plus{}); - block_result[warp.meta_group_rank()] = warp_result; - block.sync(); - if(warp.meta_group_rank() == 0) { - float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f; - float block_sum = cg::reduce(warp, gather, cg::plus{}); - if(warp.thread_rank() == 0) { - atomicAdd(out, block_sum); - } + float block_sum = blockReduce(accumulator); + if(threadIdx.x == 0) { + atomicAdd(out, block_sum); } } @@ -1716,9 +1705,9 @@ void global_norm(float* out, const T* values, size_t count) { // one block too many is catastrophic, since it only can start once all the other // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 // on all gpus, so the division really is going to be exact. - const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size; + const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - norm_kernel<<>>(out, values, count); + global_norm_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } From 66ce5766e004f9eec34363f4eb8480b3a305ebc6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 18 May 2024 23:06:26 +0300 Subject: [PATCH 100/172] fixed up dev/cuda --- dev/cuda/global_norm.cu | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu index 434c343f2..2295c4976 100644 --- a/dev/cuda/global_norm.cu +++ b/dev/cuda/global_norm.cu @@ -5,35 +5,17 @@ Global norm in this context means that we want to calculate a single norm cooper Compile example: nvcc -O3 --use_fast_math global_norm.cu -o global_norm - -version 1 uses as few blocks as possible to still fill the GPU, and only does atomic adds in the end -./gelu_forward 1 - -version 2 is the same but with only warp-wide reduction inside the kernel, and more global atomics -./gelu_forward 2 */ -#include "common.h" + #include #include #include -// TODO move this into common.h // turn on bf16 as default, done up here for now #define ENABLE_BF16 +#include "common.h" -#if defined(ENABLE_BF16) -typedef __nv_bfloat16 floatX; -typedef __nv_bfloat16 floatN; -#elif defined(ENABLE_FP16) -typedef half floatX; -typedef half floatN; -#else -typedef float floatX; -typedef float floatN; -#endif - -typedef Packed128 x128; float global_norm_cpu(const float* data, size_t count) { // accumulate in double so we have an accurate numerical reference @@ -167,7 +149,7 @@ int main(int argc, const char **argv) { cudaCheck(cudaMalloc(&d_inp, num_params * sizeof(floatX))); cudaCheck(memcpy_convert(d_inp, inp, num_params)); - int block_sizes[] = {32, 64, 128, 256, 512, 1024}; + int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; for (int j = 0; j < 
sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; printf("Checking block size %d.\n", block_size); From 77b991281ff9db57fa4f8615d2431e76d59ec3fd Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 11:51:01 +0000 Subject: [PATCH 101/172] add hellaswag reference eval. it looks pretty bad honestly, the examples themselves, but i'm told it is predictive... hmm --- hellaswag.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 hellaswag.py diff --git a/hellaswag.py b/hellaswag.py new file mode 100644 index 000000000..a0fea2afa --- /dev/null +++ b/hellaswag.py @@ -0,0 +1,160 @@ +""" +Downloads and evaluates HellaSwag in Python. +This then acts as the reference file for llm.c +https://github.com/rowanz/hellaswag + +Example HellaSwag json item: + +{"ind": 24, "activity_label": "Roof shingle removal", "ctx_a": "A man is sitting on a roof.", "ctx_b": "he", "ctx": "A man is sitting on a roof. he", "split": "val", "split_type": "indomain", "label": 3, "endings": ["is using wrap to wrap a pair of skis.", "is ripping level tiles off.", "is holding a rubik's cube.", "starts pulling up roofing on a roof."], "source_id": "activitynet~v_-JhWjGDPHMY"} + +ind: dataset ID +activity_label: The ActivityNet or WikiHow label for this example +context: There are two formats. The full context is in ctx. When the context ends in an (incomplete) noun phrase, like for ActivityNet, this incomplete noun phrase is in ctx_b, and the context up until then is in ctx_a. This can be useful for models such as BERT that need the last sentence to be complete. However, it's never required. If ctx_b is nonempty, then ctx is the same thing as ctx_a, followed by a space, then ctx_b. +endings: a list of 4 endings. The correct index is given by label (0,1,2, or 3) +split: train, val, or test. 
+split_type: indomain if the activity label is seen during training, else zeroshot +source_id: Which video or WikiHow article this example came from +""" + +import os +import json +import requests +import tiktoken +from tqdm import tqdm + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from transformers import GPT2LMHeadModel + +DATA_CACHE_DIR = "data" + +hellaswags = { + "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", + "val": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl", + "test": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_test.jsonl", +} + +enc = tiktoken.get_encoding("gpt2") + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + +def download(split): + """Downloads HellaSwag DATA_CACHE_DIR""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + data_url = hellaswags[split] + data_filename = os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + else: + print(f"{data_filename} already exists, skipping download...") + +def render_example(example): + """ + Given the example as a dictionary, render it as three torch tensors: + - tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates) + - mask (is 1 in the region of the candidate completion, where we evaluate likelihoods) + - label (the index of the correct completion, which we hope has the highest likelihood) + """ + ctx = example["ctx"] + label = example["label"] + endings = example["endings"] + + # gather up all the tokens + ctx_tokens = enc.encode(ctx) + tok_rows = [] + mask_rows = [] + for end in endings: + end_tokens = enc.encode(" " + end) # note: prepending " " because GPT-2 tokenizer + tok_rows.append(ctx_tokens + end_tokens) + mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens)) + + # have to be careful during the collation because the number of tokens in each row can differ + max_len = max(len(row) for row in tok_rows) + tokens = torch.zeros((4, max_len), dtype=torch.long) + mask = torch.zeros((4, max_len), dtype=torch.long) + for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)): + tokens[i, :len(tok_row)] = torch.tensor(tok_row) + mask[i, :len(mask_row)] = torch.tensor(mask_row) + + return tokens, mask, label + +def iterate_examples(split): + download(split) + with open(os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl"), "r") as f: + for line in f: + example = json.loads(line) + rendered = render_example(example) + yield example, rendered + +@torch.no_grad() +def evaluate(model_type="gpt2-xl", device="cuda"): + + model = GPT2LMHeadModel.from_pretrained(model_type) + model.to(device) + + num_correct = 0 + num_total = 0 + data_it = iterate_examples("val") + for example, (tokens, mask, label) in data_it: + tokens = tokens.to(device) + mask = mask.to(device) + + # get the logits + logits = model(tokens).logits + # evaluate the autoregressive loss at all positions + shift_logits = (logits[..., :-1, 
:]).contiguous() + shift_tokens = (tokens[..., 1:]).contiguous() + flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + flat_shift_tokens = shift_tokens.view(-1) + shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') + shift_losses = shift_losses.view(tokens.size(0), -1) + # now get the average loss just for the completion region (where mask == 1), in each row + shift_mask = (mask[..., :-1]).contiguous() + masked_shift_losses = shift_losses * shift_mask + # sum and divide by the number of 1s in the mask + sum_loss = masked_shift_losses.sum(dim=1) + avg_loss = sum_loss / shift_mask.sum(dim=1) + # now we have a loss for each of the 4 completions + # the one with the lowest loss should be the most likely + # to think through more carefully: sum or average? sum is more right probabilistically + use_loss = sum_loss + # ok predict what the model thinks is the most likely completion + pred = use_loss.argmin().item() + + # accumulate stats + num_total += 1 + num_correct += int(pred == label) + print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + + # debug: pretty print a few examples, and the losses in each case + if num_total < 10: + print("---") + print(f"Context:\n {example['ctx']}") + print(f"Endings:") + for i, end in enumerate(example["endings"]): + print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"predicted: {pred}, actual: {label}") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-m", type=str, default="gpt2-xl", help="the model type to use") + parser.add_argument("-d", type=str, default="cuda", help="the device to use") + args = parser.parse_args() + evaluate(args.model_type, args.device) From aec9ce5d0a9f4b819a552b1438303506ed6f2dd8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 12:18:02 +0000 Subject: [PATCH 102/172] move hellaswag file to dev --- hellaswag.py => dev/hellaswag.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hellaswag.py => dev/hellaswag.py (100%) diff --git a/hellaswag.py b/dev/hellaswag.py similarity index 100% rename from hellaswag.py rename to dev/hellaswag.py From 7bd2389c1fe302b59f7ddc7dcacdb5f11e7765e2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 14:23:51 +0000 Subject: [PATCH 103/172] add mmlu as well and refine both a bit --- dev/hellaswag.py | 20 +++--- dev/mmlu.py | 156 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 8 deletions(-) create mode 100644 dev/mmlu.py diff --git a/dev/hellaswag.py b/dev/hellaswag.py index a0fea2afa..cdfe364cc 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -14,6 +14,9 @@ split: train, val, or test. 
split_type: indomain if the activity label is seen during training, else zeroshot source_id: Which video or WikiHow article this example came from + +gpt2 (124M) = 28.2% +gpt2-xl (1558M) = 39.22% """ import os @@ -28,7 +31,7 @@ from transformers import GPT2LMHeadModel -DATA_CACHE_DIR = "data" +DATA_CACHE_DIR = os.path.join("data", "hellaswag") hellaswags = { "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", @@ -95,23 +98,24 @@ def render_example(example): return tokens, mask, label def iterate_examples(split): + # there are 10,042 examples in total in val + download(split) with open(os.path.join(DATA_CACHE_DIR, f"hellaswag_{split}.jsonl"), "r") as f: for line in f: example = json.loads(line) - rendered = render_example(example) - yield example, rendered + yield example @torch.no_grad() -def evaluate(model_type="gpt2-xl", device="cuda"): +def evaluate(model_type, device): model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) num_correct = 0 num_total = 0 - data_it = iterate_examples("val") - for example, (tokens, mask, label) in data_it: + for example in iterate_examples("val"): + tokens, mask, label = render_example(example) tokens = tokens.to(device) mask = mask.to(device) @@ -154,7 +158,7 @@ def evaluate(model_type="gpt2-xl", device="cuda"): if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() - parser.add_argument("-m", type=str, default="gpt2-xl", help="the model type to use") - parser.add_argument("-d", type=str, default="cuda", help="the device to use") + parser.add_argument("-m", "--model_type", type=str, default="gpt2", help="the model type to use") + parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use") args = parser.parse_args() evaluate(args.model_type, args.device) diff --git a/dev/mmlu.py b/dev/mmlu.py new file mode 100644 index 000000000..f15b785b0 --- /dev/null +++ b/dev/mmlu.py @@ -0,0 +1,156 @@ +""" +Downloads and evaluates MMLU in Python. +This then acts as the reference file for llm.c +https://github.com/hendrycks/test + +gpt2 (124M) ~= 25% (chance) +gpt2-xl (1558M) = 27.00% ... +""" + +import os +import requests +import tiktoken +import pandas as pd +from tqdm import tqdm + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from transformers import GPT2LMHeadModel + +DATA_CACHE_DIR = os.path.join("data", "mmlu") + +enc = tiktoken.get_encoding("gpt2") +data_url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) + +def download(): + """Downloads MMLU to DATA_CACHE_DIR""" + os.makedirs(DATA_CACHE_DIR, exist_ok=True) + data_filename = os.path.join(DATA_CACHE_DIR, f"data.tar") + if not os.path.exists(data_filename): + print(f"Downloading {data_url} to {data_filename}...") + download_file(data_url, data_filename) + os.system(f"tar -xf {data_filename} -C {DATA_CACHE_DIR}") # untar + # creates a directory "data" inside it, with e.g. 
data/test/*csv + else: + print(f"{data_filename} already exists, skipping download...") + +def iterate_examples(): + # there are 14,042 examples in total in the test set + + download() + test_dir = os.path.join(DATA_CACHE_DIR, "data", "test") + csv_files = [f for f in os.listdir(test_dir) if f.endswith(".csv")] + for csv_file in csv_files: + csv_path = os.path.join(test_dir, csv_file) + print(csv_path) + df = pd.read_csv(csv_path, header=None) + n = df.shape[0] + for idx in range(n): + example = { + "question": df.iloc[idx, 0], + "endings": [df.iloc[idx, 1], df.iloc[idx, 2], df.iloc[idx, 3], df.iloc[idx, 4]], + "label": df.iloc[idx, 5], + } + yield example + +def render_example(example): + """ + Given the example as a dictionary, render it as three torch tensors: + - tokens (the tokens of context + completion, of size 4xN, as there are always 4 candidates) + - mask (is 1 in the region of the candidate completion, where we evaluate likelihoods) + - label (the index of the correct completion, which we hope has the highest likelihood) + """ + ctx = f"Question: {example['question']}\n\nAnswer:" + ctx_tokens = enc.encode(ctx) + + tok_rows = [] + mask_rows = [] + for end in example["endings"]: + end_tokens = enc.encode(" " + str(end)) # note: prepending " " because GPT-2 tokenizer + tok_rows.append(ctx_tokens + end_tokens) + mask_rows.append([0]*len(ctx_tokens) + [1]*len(end_tokens)) + + # have to be careful during the collation because the number of tokens in each row can differ + max_len = max(len(row) for row in tok_rows) + tokens = torch.zeros((4, max_len), dtype=torch.long) + mask = torch.zeros((4, max_len), dtype=torch.long) + for i, (tok_row, mask_row) in enumerate(zip(tok_rows, mask_rows)): + tokens[i, :len(tok_row)] = torch.tensor(tok_row) + mask[i, :len(mask_row)] = torch.tensor(mask_row) + + label = "ABCD".index(example["label"]) + return tokens, mask, label + +@torch.no_grad() +def evaluate(model_type, device): + + model = GPT2LMHeadModel.from_pretrained(model_type) + model.to(device) + + num_correct = 0 + num_total = 0 + for example in iterate_examples(): + tokens, mask, label = render_example(example) + tokens = tokens.to(device) + mask = mask.to(device) + + # get the logits + logits = model(tokens).logits + # evaluate the autoregressive loss at all positions + shift_logits = (logits[..., :-1, :]).contiguous() + shift_tokens = (tokens[..., 1:]).contiguous() + flat_shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + flat_shift_tokens = shift_tokens.view(-1) + shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') + shift_losses = shift_losses.view(tokens.size(0), -1) + # now get the average loss just for the completion region (where mask == 1), in each row + shift_mask = (mask[..., :-1]).contiguous() + masked_shift_losses = shift_losses * shift_mask + # sum and divide by the number of 1s in the mask + sum_loss = masked_shift_losses.sum(dim=1) + avg_loss = sum_loss / shift_mask.sum(dim=1) + # now we have a loss for each of the 4 completions + # the one with the lowest loss should be the most likely + # to think through more carefully: sum or average? 
sum is more right probabilistically + use_loss = sum_loss + # ok predict what the model thinks is the most likely completion + pred = use_loss.argmin().item() + + # accumulate stats + num_total += 1 + num_correct += int(pred == label) + print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + + # debug prints + if num_total < 10: + print("---") + print(f"Context:\n {example['question']}") + print(f"Endings:") + for i, end in enumerate(example["endings"]): + print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"predicted: {pred}, actual: {label}") + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model_type", type=str, default="gpt2", help="the model type to use") + parser.add_argument("-d", "--device", type=str, default="cuda", help="the device to use") + args = parser.parse_args() + evaluate(args.model_type, args.device) From 4e14b5228528da147bab6e1ff71e76a8775dd73a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 15:18:04 +0000 Subject: [PATCH 104/172] report both acc and acc_norm --- dev/hellaswag.py | 24 ++++++++++++++++-------- dev/mmlu.py | 20 +++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dev/hellaswag.py b/dev/hellaswag.py index cdfe364cc..0a6d11b70 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -15,8 +15,13 @@ split_type: indomain if the activity label is seen during training, else zeroshot source_id: Which video or WikiHow article this example came from -gpt2 (124M) = 28.2% -gpt2-xl (1558M) = 39.22% +gpt2 (124M) +- eleuther harness reports acc 28.92%, acc_norm 31.14% (multiple choice style) +- this script: 10042 acc: 0.2820 acc_norm: 0.2839 + +gpt2-xl (1558M) +- eleuther harness reports acc 40.04%, acc_norm 50.89% (multiple choice style) +- this script: 10042 acc: 0.3922 acc_norm: 0.4664 """ import os @@ -109,9 +114,13 @@ def iterate_examples(split): @torch.no_grad() def evaluate(model_type, device): + torch.set_float32_matmul_precision('high') # use tf32 + model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) + # model = torch.compile(model) + num_correct_norm = 0 num_correct = 0 num_total = 0 for example in iterate_examples("val"): @@ -136,15 +145,14 @@ def evaluate(model_type, device): avg_loss = sum_loss / shift_mask.sum(dim=1) # now we have a loss for each of the 4 completions # the one with the lowest loss should be the most likely - # to think through more carefully: sum or average? 
sum is more right probabilistically - use_loss = sum_loss - # ok predict what the model thinks is the most likely completion - pred = use_loss.argmin().item() + pred = sum_loss.argmin().item() + pred_norm = avg_loss.argmin().item() # accumulate stats num_total += 1 num_correct += int(pred == label) - print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + num_correct_norm += int(pred_norm == label) + print(f"{num_total} acc: {num_correct/num_total:.4f} acc_norm: {num_correct_norm/num_total:.4f}") # debug: pretty print a few examples, and the losses in each case if num_total < 10: @@ -152,7 +160,7 @@ def evaluate(model_type, device): print(f"Context:\n {example['ctx']}") print(f"Endings:") for i, end in enumerate(example["endings"]): - print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"{i} (loss: {avg_loss[i].item():.4f}) {end}") print(f"predicted: {pred}, actual: {label}") if __name__ == "__main__": diff --git a/dev/mmlu.py b/dev/mmlu.py index f15b785b0..67520aeb9 100644 --- a/dev/mmlu.py +++ b/dev/mmlu.py @@ -3,8 +3,11 @@ This then acts as the reference file for llm.c https://github.com/hendrycks/test -gpt2 (124M) ~= 25% (chance) +gpt2 (124M) +- this script: 14042 acc: 0.2534 acc_norm: 0.2734 + gpt2-xl (1558M) = 27.00% ... +- this script: 14042 acc: 0.2700 acc_norm: 0.2938 """ import os @@ -101,9 +104,13 @@ def render_example(example): @torch.no_grad() def evaluate(model_type, device): + torch.set_float32_matmul_precision('high') # use tf32 + model = GPT2LMHeadModel.from_pretrained(model_type) model.to(device) + # model = torch.compile(model) + num_correct_norm = 0 num_correct = 0 num_total = 0 for example in iterate_examples(): @@ -128,15 +135,14 @@ def evaluate(model_type, device): avg_loss = sum_loss / shift_mask.sum(dim=1) # now we have a loss for each of the 4 completions # the one with the lowest loss should be the most likely - # to think through more carefully: sum or average? 
sum is more right probabilistically - use_loss = sum_loss - # ok predict what the model thinks is the most likely completion - pred = use_loss.argmin().item() + pred = sum_loss.argmin().item() + pred_norm = avg_loss.argmin().item() # accumulate stats num_total += 1 num_correct += int(pred == label) - print(f"accuracy: {num_correct/num_total:.4f} ({num_correct}/{num_total})") + num_correct_norm += int(pred_norm == label) + print(f"{num_total} acc: {num_correct/num_total:.4f} acc_norm: {num_correct_norm/num_total:.4f}") # debug prints if num_total < 10: @@ -144,7 +150,7 @@ def evaluate(model_type, device): print(f"Context:\n {example['question']}") print(f"Endings:") for i, end in enumerate(example["endings"]): - print(f"{i} (loss: {use_loss[i].item():.4f}) {end}") + print(f"{i} (loss: {avg_loss[i].item():.4f}) {end}") print(f"predicted: {pred}, actual: {label}") if __name__ == "__main__": From 8d55c4a6acc3bcf94b7ddb47b9aeaaeea520e3f4 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 15:45:28 +0000 Subject: [PATCH 105/172] fix a bug, we have to be careful to make sure we evaluate loss at the token just before the first completion token, because that is the prediction for the first token, and its accuracy --- dev/hellaswag.py | 6 +++--- dev/mmlu.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dev/hellaswag.py b/dev/hellaswag.py index 0a6d11b70..361a7757a 100644 --- a/dev/hellaswag.py +++ b/dev/hellaswag.py @@ -17,11 +17,11 @@ gpt2 (124M) - eleuther harness reports acc 28.92%, acc_norm 31.14% (multiple choice style) -- this script: 10042 acc: 0.2820 acc_norm: 0.2839 +- this script: 10042 acc: 0.2859 acc_norm: 0.2955 (completion style) gpt2-xl (1558M) - eleuther harness reports acc 40.04%, acc_norm 50.89% (multiple choice style) -- this script: 10042 acc: 0.3922 acc_norm: 0.4664 +- this script: 10042 acc: 0.3842 acc_norm: 0.4893 (completion style) """ import os @@ -138,7 +138,7 @@ def evaluate(model_type, device): shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') shift_losses = shift_losses.view(tokens.size(0), -1) # now get the average loss just for the completion region (where mask == 1), in each row - shift_mask = (mask[..., :-1]).contiguous() + shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token masked_shift_losses = shift_losses * shift_mask # sum and divide by the number of 1s in the mask sum_loss = masked_shift_losses.sum(dim=1) diff --git a/dev/mmlu.py b/dev/mmlu.py index 67520aeb9..b61fe9324 100644 --- a/dev/mmlu.py +++ b/dev/mmlu.py @@ -4,10 +4,10 @@ https://github.com/hendrycks/test gpt2 (124M) -- this script: 14042 acc: 0.2534 acc_norm: 0.2734 +- this script: 14042 acc: 0.2557 acc_norm: 0.2721 -gpt2-xl (1558M) = 27.00% ... 
-- this script: 14042 acc: 0.2700 acc_norm: 0.2938 +gpt2-xl (1558M) +- this script: 14042 acc: 0.2927 acc_norm: 0.3035 """ import os @@ -128,7 +128,7 @@ def evaluate(model_type, device): shift_losses = F.cross_entropy(flat_shift_logits, flat_shift_tokens, reduction='none') shift_losses = shift_losses.view(tokens.size(0), -1) # now get the average loss just for the completion region (where mask == 1), in each row - shift_mask = (mask[..., :-1]).contiguous() + shift_mask = (mask[..., 1:]).contiguous() # we must shift mask, so we start at the last prompt token masked_shift_losses = shift_losses * shift_mask # sum and divide by the number of 1s in the mask sum_loss = masked_shift_losses.sum(dim=1) From 9e645314e65fc44c8d15c81258bddb620cf5f3ca Mon Sep 17 00:00:00 2001 From: Jun Zhang Date: Mon, 20 May 2024 00:00:21 +0800 Subject: [PATCH 106/172] Check if file exists using platform specific commands & Add *.o to gitignore Signed-off-by: Jun Zhang --- .gitignore | 1 + Makefile | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5e88e4285..f60885e23 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce *.obj *.exe +*.o # log files *.log diff --git a/Makefile b/Makefile index 46abdc9a5..c8b555ac2 100644 --- a/Makefile +++ b/Makefile @@ -23,9 +23,15 @@ NVCC_CUDNN = USE_CUDNN ?= 0 # Function to check if a file exists in the PATH +ifneq ($(OS), Windows_NT) define file_exists_in_path - $(shell where $(1) 2>nul || which $(1) 2>/dev/null) + $(which $(1) 2>/dev/null) endef +else +define file_exists_in_path + $(shell where $(1) 2>nul) +endef +endif ifneq ($(CI),true) # if not in CI, then use the GPU query ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= From c2d12f725ebd379eabbf8d4533bee51c20faa3fc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 17:07:55 +0000 Subject: [PATCH 107/172] small touchups to grad clip --- dev/cuda/global_norm.cu | 11 +++++++---- train_gpt2.cu | 33 ++++++++++++++------------------- train_gpt2.py | 6 +++--- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu index 2295c4976..6c2ed0389 100644 --- a/dev/cuda/global_norm.cu +++ b/dev/cuda/global_norm.cu @@ -59,10 +59,15 @@ __global__ void norm_kernel1(float* out, const T* data, size_t count) { } } - - template __global__ void norm_kernel2(float* out, const T* data, size_t count) { + // concrete example for an A100 GPU (108 SMs, 2048 max threads each) + // so there are 2048 * 108 = 221,184 threads total + // say the block_size is 512, then we would launch 432 blocks in total + // say num_params is ~100M, each thread will process ~500 elements + // warps reduce with warp-level reduce, we have 221,184/32 = 6,912 warps + // and then each warp atomicAdd's to global memory, total of 6,912 atomics + // no shared memory; but one atomic per warp instead of per block namespace cg = cooperative_groups; cg::thread_block block = cg::this_thread_block(); @@ -84,8 +89,6 @@ __global__ void norm_kernel2(float* out, const T* data, size_t count) { } } - - template void global_norm1(float* out, const T* values, size_t count, int block_size) { // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. 
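
For reference, a minimal standalone sketch of the occupancy-based launch sizing used in the kernels above: the grid is sized so that every SM is filled exactly once (maxThreadsPerMultiProcessor * multiProcessorCount / block_size), and each thread strides over the data. The kernel and buffer names here are illustrative placeholders, and the per-thread atomicAdd is a simplification of the block-level reduction in the patched code.

```cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Illustrative kernel: grid-stride loop accumulating squared values.
// Simplified to one atomicAdd per thread; the patched kernel reduces within
// each block first and issues a single atomicAdd per block.
__global__ void norm_squared_kernel(float* out, const float* data, size_t count) {
    size_t index = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    size_t grid_width = (size_t)blockDim.x * gridDim.x;
    float accumulator = 0.f;
    for (size_t i = index; i < count; i += grid_width) {
        accumulator += data[i] * data[i];
    }
    atomicAdd(out, accumulator);
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int block_size = 512;
    // fill the GPU exactly, e.g. A100: 2048 threads/SM * 108 SMs / 512 = 432 blocks
    const int grid_size = prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;

    size_t count = 1 << 20;  // dummy data, zero-initialized
    float *d_data, *d_out;
    cudaMalloc(&d_data, count * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_data, 0, count * sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));
    norm_squared_kernel<<<grid_size, block_size>>>(d_out, d_data, count);

    float norm_squared;
    cudaMemcpy(&norm_squared, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("grid_size = %d, norm^2 = %f\n", grid_size, norm_squared);
    cudaFree(d_data);
    cudaFree(d_out);
    return 0;
}
```
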
diff --git a/train_gpt2.cu b/train_gpt2.cu index 05a0ba2fc..49b92647a 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,31 +1151,26 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, - float* grad_norm, float max_grad_norm, + float* grad_norm, float grad_clip, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard - float scale = 1.f; if(!isfinite(*grad_norm)) { // if we had a numerical problem (e.g, overflow) - // in our gradient calculation, don't mess up the - // existing weights. + // in our gradient norm calculation, don't mess up the existing weights. // TODO increase a global counter somewhere so we actually know if/how often this happens - if(threadIdx.x == 0 && blockIdx.x == 0) { + if(threadIdx.x == 0 && blockIdx.x == 0) { printf("[WARNING] weight update skipped due to non-finite gradients!\n"); } return; } - if(*grad_norm > max_grad_norm) { - scale = max_grad_norm / sqrtf(*grad_norm); - // TODO just for debugging, remove this - if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[norm %f]\n", sqrtf(*grad_norm)); - } - } // get the gradient, m, and v for this parameter - float grad = scale * (float)grads_memory[idx]; + float grad = (float)grads_memory[idx]; + // clip the gradients if their norm surpasses grad_clip + if(*grad_norm > grad_clip) { + grad *= grad_clip / sqrtf(*grad_norm); + } float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -2388,7 +2383,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clipping, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2425,7 +2420,7 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, model->m_memory, model->v_memory, num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, grad_clipping, + grad_norm, grad_clip, seed); cudaCheck(cudaGetLastError()); } @@ -2672,7 +2667,7 @@ int main(int argc, char *argv[]) { int use_master_weights = 1; int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training - float grad_clipping = 1.f; + float grad_clip = 1.0f; for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash @@ -2693,7 +2688,7 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); } - else if (argv[i][1] 
== 'c') { grad_clipping = atof(argv[i+1]); } + else if (argv[i][1] == 'c') { grad_clip = atof(argv[i+1]); } else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else { error_usage(); } @@ -2709,7 +2704,7 @@ int main(int argc, char *argv[]) { printf0("| sequence length T | %-50d |\n", T); printf0("| total batch size | %-50d |\n", total_batch_size); printf0("| learning rate | %-50e |\n", learning_rate); - printf0("| grad_clipping | %-50e |\n", grad_clipping); + printf0("| grad_clip | %-50e |\n", grad_clip); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); printf0("| val_max_batches | %-50d |\n", val_max_batches); @@ -2894,7 +2889,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clipping, step+1, &multi_gpu_config); + gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); diff --git a/train_gpt2.py b/train_gpt2.py index b57fe432c..ab1c3e44d 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -407,7 +407,7 @@ def print0(*args, **kwargs): parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions") parser.add_argument("--sequence_length", type=int, default=64, help="sequence length") parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens") - parser.add_argument("--grad_clipping", type=float, default=1, help="maximum gradient magnitude") + parser.add_argument("--grad_clip", type=float, default=1.0, help="maximum gradient magnitude") args = parser.parse_args() B, T = args.batch_size, args.sequence_length assert 1 <= T <= 1024 @@ -553,7 +553,7 @@ def get_batch(): if device == "cuda": torch.cuda.reset_peak_memory_stats() timings = [] - norm = -1 # dummy value to print in inference-only mode + norm = -1.0 # dummy value to print in inference-only mode for step in range(args.num_iterations): t0 = time.time() @@ -577,7 +577,7 @@ def get_batch(): # backward pass if not args.inference_only: loss.backward() - norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clipping) + norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() optimizer.zero_grad(set_to_none=True) From bc58cd1dc1aa6a3aa121033616ffc7f6f01871b5 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 19 May 2024 17:51:29 +0000 Subject: [PATCH 108/172] fix small bugs in grad clip, introduce a GPU CPU synch point to communicate the float grad_clip to print it, and small printing changes --- train_gpt2.cu | 65 ++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 49b92647a..f9cd4e2f7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1151,26 +1151,12 @@ __device__ float lerp(float start, float end, float weight) { template __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, - float* grad_norm, float grad_clip, - unsigned 
int seed) { + float grad_scale, unsigned int seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_parameters) { return; } // guard - if(!isfinite(*grad_norm)) { - // if we had a numerical problem (e.g, overflow) - // in our gradient norm calculation, don't mess up the existing weights. - // TODO increase a global counter somewhere so we actually know if/how often this happens - if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("[WARNING] weight update skipped due to non-finite gradients!\n"); - } - return; - } // get the gradient, m, and v for this parameter - float grad = (float)grads_memory[idx]; - // clip the gradients if their norm surpasses grad_clip - if(*grad_norm > grad_clip) { - grad *= grad_clip / sqrtf(*grad_norm); - } + float grad = grad_scale * (float)grads_memory[idx]; float m = m_memory[idx]; float v = v_memory[idx]; // update the first moment (momentum) @@ -1196,7 +1182,7 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg } template -__global__ void global_norm_kernel(float* out, const T* data, size_t count) { +__global__ void global_norm_squared_kernel(float* out, const T* data, size_t count) { // we want as few atomics as possible, so each block tries to do // the maximum amount of work (so no fixed chunk, but instead iterating // until we run out of data), and then we reduce inside the block @@ -1693,7 +1679,7 @@ void fused_classifier(Type* logits, Type* losses, } template -void global_norm(float* out, const T* values, size_t count) { +void global_norm_squared(float* out, const T* values, size_t count) { const int block_size = 512; // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. // having one block less than possible is a tiny performance hit, having @@ -1702,7 +1688,9 @@ void global_norm(float* out, const T* values, size_t count) { // on all gpus, so the division really is going to be exact. const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; assert(grid_size > 0); // gives a better error than letting the call below fail - global_norm_kernel<<>>(out, values, count); + // initialize out with zero + cudaCheck(cudaMemset(out, 0, sizeof(float))); + global_norm_squared_kernel<<>>(out, values, count); cudaCheck(cudaGetLastError()); } @@ -2383,7 +2371,7 @@ void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { #endif } -void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { +float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); size_t num_parameters = multi_gpu_config->shard_num_parameters; floatX* params_memory = (floatX*)model->params_memory + multi_gpu_config->shard_offset; @@ -2404,25 +2392,34 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo } } - // repurposing this buffer. We calculate the gradient norm on the GPU, and need it in the next kernel, - // so we _really_ don't want to transfer it here as an actual float. So we just pass around a pointer - // to this memory that is not otherwise needed during the update phase. 
- float* grad_norm = (float*)model->acts.output; - - // global gradient norm - global_norm(grad_norm, (floatX*)model->grads_memory, model->num_parameters); + // gradient clipping + // repurposing this buffer (which isn't needed now) to write grad norm into it + float* grad_norm_squared = (float*)model->acts.output; + global_norm_squared(grad_norm_squared, (floatX*)model->grads_memory, model->num_parameters); + // transfer the gradient norm to CPU + float grad_norm_squared_cpu = 0.0f; + cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost)); + if(!isfinite(grad_norm_squared_cpu)) { + // may happen due to some issue (e.g. overflow?) + // TODO: later may want to keep a global counter of instabilities like this + printf0("[WARNING]: grad norm is not finite, skipping AdamW update\n"); + return -1.0f; + } + float grad_norm_cpu = sqrtf(grad_norm_squared_cpu); + float grad_scale = (grad_norm_cpu > grad_clip) ? grad_clip / grad_norm_cpu : 1.0f; + // AdamW update int block_size = 512; int num_blocks = CEIL_DIV(num_parameters, block_size); float beta1_correction = 1.0f - powf(beta1, t); float beta2_correction = 1.0f - powf(beta2, t); unsigned int seed = random_u32(&model->rng_state); adamw_kernel3<<>>(params_memory, model->master_weights, grads_memory, - model->m_memory, model->v_memory, num_parameters, - learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, - grad_norm, grad_clip, - seed); + model->m_memory, model->v_memory, num_parameters, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_scale, seed); cudaCheck(cudaGetLastError()); + return grad_norm_cpu; } void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) @@ -2889,7 +2886,7 @@ int main(int argc, char *argv[]) { model.mean_loss = lossf; // update the parameters gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); + float grad_norm = gpt2_update(&model, learning_rate, 0.9f, 0.999f, 1e-8f, 0.0f, grad_clip, step+1, &multi_gpu_config); gpt2_multi_gpu_gather(&model, &multi_gpu_config); // zero out the gradients for the next iteration gpt2_zero_grad(&model); @@ -2911,8 +2908,8 @@ int main(int argc, char *argv[]) { bias_corrected_ema_tokens_per_second = ema_tokens_per_second / (1.0f - powf(0.95f, step)); } float accumulated_loss = multi_gpu_config.num_processes == 1 ? 
model.mean_loss : model.accumulated_mean_loss; - printf0("step %4d/%d: train loss %f (acc %f) (%f ms, %0f tok/s)\n", - step + 1, train_num_batches, model.mean_loss, accumulated_loss, + printf0("step %4d/%d: train loss %f norm %.4f (%.2f ms, %.0f tok/s)\n", + step + 1, train_num_batches, accumulated_loss, grad_norm, time_elapsed_ms, bias_corrected_ema_tokens_per_second); logger_log_train(&logger, step, model.mean_loss); From 299ce659ec571bd843a10010115d45742df6d285 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 19 May 2024 22:25:00 +0300 Subject: [PATCH 109/172] initialize multi_gpu_config so profile doesn't crash --- profile_gpt2.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 4b24c8973..1a1ad51b9 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -49,6 +49,7 @@ int main(int argc, char *argv[]) { // override number of layers to 1 because all layers repeat the same kernels, only profile once model.config.num_layers = 1; + set_zero_configs(&multi_gpu_config, 0, model.num_parameters); // do a training step gpt2_forward(&model, x, y, B, T); From ead5d3597381a71eddb517775bc7509383651b36 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 19 May 2024 22:54:47 +0000 Subject: [PATCH 110/172] Added warpsize as a constant for better compile time optimization and standardization --- train_gpt2.cu | 67 ++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..741d1a7c8 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -114,6 +114,9 @@ class NvtxRange { #define MAX_1024_THREADS_BLOCKS 1 #endif +// WarpSize is not a compile time constant, this allows the compiler to optimize +#define WARP_SIZE 32U + // cuBLAS workspace. 
Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK const size_t cublaslt_workspace_size = 32 * 1024 * 1024; void* cublaslt_workspace = NULL; @@ -203,10 +206,10 @@ template __device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { // two reductions of up to 1024 threads: // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) - __shared__ float shared_val[32]; - const int lane_id = threadIdx.x % 32; - const int warp_id = threadIdx.x / 32; - const int num_warps = blockDim.x / 32; + __shared__ float shared_val[WARP_SIZE]; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; float warp_val = warp_reduction(val); if (lane_id == 0) { shared_val[warp_id] = warp_val; } @@ -578,10 +581,9 @@ __global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, const floatX* __restrict__ inp, const floatX* __restrict__ weight, const floatX* __restrict__ bias, int N, int C) { - const int warp_size = 32; - int lane_id = threadIdx.x % warp_size; - int warp_id = threadIdx.x / warp_size; - int num_warps = blockDim.x / warp_size; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; int idx = blockIdx.x * num_warps + warp_id; if(idx >= N) { return; } // guard @@ -591,7 +593,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // mean float sum = 0.0f; - for (int i = lane_id; i < C; i += warp_size) { + for (int i = lane_id; i < C; i += WARP_SIZE) { sum += (float)x[i]; } sum = warpReduceSum(sum); @@ -602,7 +604,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // rstd sum = 0.0f; - for (int i = lane_id; i < C; i += warp_size) { + for (int i = lane_id; i < C; i += WARP_SIZE) { float diff = (float)x[i] - m; sum += diff * diff; } @@ -614,7 +616,7 @@ __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __re // final normalization and scaling by weight/bias floatX* o = out + idx * C; - for (int c = lane_id; c < C; c += warp_size) { + for (int c = lane_id; c < C; c += WARP_SIZE) { // load and store using the .cs "streaming" hint to the compiler, // indicating that this data will not be reused soon, and can be streamed through the caches // this allows the threads to get more cache-hits for the (shared) weight and bias parameters @@ -627,8 +629,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, const floatX* inp1, const floatX* inp2, const floatX* weight, const floatX* bias, int N, int C) { - constexpr const int WarpSize = 32; - assert(blockDim.x == WarpSize); + assert(blockDim.x == WARP_SIZE); // load weights and biases into shared memory // do this before we allow any threads to exit! 
@@ -639,8 +640,8 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, x128* s_bias = reinterpret_cast(params) + (C / x128::size); x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); - int sidx = (threadIdx.x + WarpSize * threadIdx.y) * x128::size; - for(int i = sidx; i < C; i += blockDim.y * WarpSize * x128::size) { + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { s_weight[i/x128::size] = load128(weight + i); s_bias[i/x128::size] = load128(bias + i); } @@ -657,7 +658,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, const float eps = 1e-5f; float sum = 0.0f; - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 in1 = load128cs(inp1 + c); const x128 in2 = load128cs(inp2 + c); x128 out; @@ -673,7 +674,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, float m = sum / C; float v = 0.f; - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 res = s_res[c / x128::size]; for(int k = 0; k < x128::size; ++k) { v += ((float)res[k] - m) * ((float)res[k] - m); @@ -683,7 +684,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, v = warpReduceSum(v) / C; float s = rsqrtf(v + eps); - for(int c = threadIdx.x * x128::size; c < C; c += WarpSize * x128::size) { + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { const x128 res = s_res[c / x128::size]; const x128 w = s_weight[c / x128::size]; const x128 b = s_bias[c / x128::size]; @@ -898,7 +899,7 @@ template __global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, std::bool_constant) { constexpr const int bdx = 4; - constexpr const int bdy = 32 / bdx; + constexpr const int bdy = WARP_SIZE / bdx; assert(blockDim.x == bdx); assert(blockDim.y == bdy); @@ -929,7 +930,7 @@ __global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout } } - __shared__ float sub_results[x128::size][32][bdy]; + __shared__ float sub_results[x128::size][WARP_SIZE][bdy]; // reduce within-warp results for (int k = 0; k < x128::size; k++) { @@ -988,12 +989,12 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with const floatX* mean, const floatX* rstd, int B, int T, int C) { extern __shared__ float shared[]; // size = 2 * C + 1 - int warpId = threadIdx.x / warpSize; // warp index within a block - int warpsInBlock = blockDim.x / warpSize; //number of warps in block + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; - int warpThreadIdx = threadIdx.x % warpSize; // Thread index within the warp + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; - int C_per_iteration = warpSize * x128::size; + int C_per_iteration = WARP_SIZE * x128::size; int iterations_C = C / C_per_iteration; // the first half of shared memory is bias, second is weight @@ -1021,7 +1022,7 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with // first: two reduce operations float dnorm_mean = 0.0f; float 
dnorm_norm_mean = 0.0f; - for (int i = warpThreadIdx * x128::size; i < C; i += warpSize * x128::size) { + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { x128 dout128_i = load128(dout_bt + i); x128 inp128_i = load128(inp_bt + i); x128 weight128_i = load128(weight + i); @@ -1053,9 +1054,9 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; float dnorm_i = (float)weight128[x] * dout_i; // gradient contribution to bias (using shared memory friendly index) - atomicAdd(&dbias_shared[shared_index + x*warpSize], dout_i); + atomicAdd(&dbias_shared[shared_index + x*WARP_SIZE], dout_i); // gradient contribution to weight (using shared memory friendly index) - atomicAdd(&dweight_shared[shared_index + x*warpSize], norm_bti * dout_i); + atomicAdd(&dweight_shared[shared_index + x*WARP_SIZE], norm_bti * dout_i); // gradient contribution to input float dval = 0.0f; dval += dnorm_i; // term 1 @@ -1095,8 +1096,8 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with x128 dbias128 = load128(dbias + global_index); x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - float s_db = scratch_dbias[shared_index + x*warpSize]; - float s_dw = scratch_dweight[shared_index + x*warpSize]; + float s_db = scratch_dbias[shared_index + x*WARP_SIZE]; + float s_dw = scratch_dweight[shared_index + x*WARP_SIZE]; dbias128[x] = (floatX)(s_db + (float)dbias128[x]); dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } @@ -1351,7 +1352,7 @@ void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, NVTX_RANGE_FN(); const int block_size = 512; const int N = B * T; - const int grid_size = CEIL_DIV(N * 32, block_size); + const int grid_size = CEIL_DIV(N * WARP_SIZE, block_size); layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); cudaCheck(cudaGetLastError()); } @@ -1496,7 +1497,7 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo const floatX* weight, const floatX* bias, int N, int C) { const int block_size = 256; - int block_y = block_size / 32; + int block_y = block_size / WARP_SIZE; const int grid_size = CEIL_DIV(N, block_y); size_t smem = (2 + block_y) * C * sizeof(floatX); @@ -1506,7 +1507,7 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); cudaGetLastError(); if(status == cudaSuccess) { - fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, + fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, weight, bias, N, C); } else { residual_forward(residual, inp1, inp2, N*C); @@ -1546,7 +1547,7 @@ void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; - dim3 block_dim = {4, 8, (unsigned)block_size/32}; + dim3 block_dim = {4, 8, (unsigned)block_size/WARP_SIZE}; const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! 
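
The WARP_SIZE macro above exists because the built-in `warpSize` variable is not a compile-time constant, so it cannot size a static shared array and the index arithmetic cannot be folded into shifts and masks. A simplified, standalone sketch of the warp-then-block reduction pattern these index calculations feed into is shown below (assuming blockDim.x is a multiple of WARP_SIZE; the real blockReduce in train_gpt2.cu is a template that also handles out-of-bounds values and an optional final sync):

```cuda
#define WARP_SIZE 32U

// reduce a value across the 32 lanes of a warp; the full sum ends up in lane 0
__device__ float warpReduceSum(float val) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xFFFFFFFFu, val, offset);
    }
    return val;
}

// reduce a value across the whole block; result is valid in lane 0 of each warp
__device__ float blockReduceSum(float val) {
    __shared__ float shared_val[WARP_SIZE];        // one slot per warp (static shared arrays need a constant size)
    const int lane_id   = threadIdx.x % WARP_SIZE; // constant modulo -> bitmask
    const int warp_id   = threadIdx.x / WARP_SIZE; // constant divide -> shift
    const int num_warps = blockDim.x / WARP_SIZE;

    float warp_val = warpReduceSum(val);           // 1) reduce inside each warp
    if (lane_id == 0) { shared_val[warp_id] = warp_val; }
    __syncthreads();                               // 2) cross-warp via shared memory
    warp_val = (lane_id < num_warps) ? shared_val[lane_id] : 0.0f;
    return warpReduceSum(warp_val);                // 3) reduce the per-warp partials
}
```
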
From 6de1137e0ec531c9660afc4048de09588dc18c5e Mon Sep 17 00:00:00 2001 From: Christopher Date: Sun, 19 May 2024 23:11:57 +0000 Subject: [PATCH 111/172] Moved bounds checks outside of kernel into assertions --- train_gpt2.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 030c5c9b7..cdffff4eb 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -844,9 +844,8 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons } } -__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2, int N) { +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_out; x128 packed_inp1 = load128cs(inp1 + idx); @@ -858,9 +857,8 @@ __global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const f } #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) -__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { +__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_out; x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache @@ -874,9 +872,8 @@ __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { store128(out + idx, packed_out); } -__global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { +__global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout) { int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= N) { return; } x128 packed_dinp; x128 packed_inp = load128cs(inp + idx); @@ -1486,8 +1483,9 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att, void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N) { NVTX_RANGE_FN(); const int block_size = 256; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - residual_forward_kernel<<>>(out, inp1, inp2, N); + residual_forward_kernel<<>>(out, inp1, inp2); cudaCheck(cudaGetLastError()); } @@ -1519,16 +1517,18 @@ void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, flo void gelu_forward(floatX* out, const floatX* inp, int N) { NVTX_RANGE_FN(); const int block_size = 512; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_forward_kernel2<<>>(out, inp, N); + gelu_forward_kernel2<<>>(out, inp); cudaCheck(cudaGetLastError()); } void gelu_backward(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { NVTX_RANGE_FN(); const int block_size = 128; + assert(N % block_size == 0); const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_backward_kernel<<>>(dinp, inp, dout, N); + gelu_backward_kernel<<>>(dinp, inp, dout); cudaCheck(cudaGetLastError()); } From 6348d4196d6857244d7833988c405e44afe578d7 Mon Sep 17 00:00:00 2001 From: lancer Date: Sun, 19 May 2024 17:39:25 -0700 Subject: [PATCH 112/172] fix the unsupported block_size --- dev/cuda/matmul_backward_bias.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 12b167083..52d793ac7 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -421,6 +421,9 @@ __global__ void reduce_add_sum_kernel(floatX* dst, 
const float* src, size_t n, s // version1: simple cuBLAS calls void matmul_backward_bias1(floatX* dbias, const floatX* dout, int B, int T, int OC, int block_size) { + if (block_size == 768) { + block_size = 1024; // block_size needs to be power of 2 due to the reduction + } dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); From 2b0667aee15151622797d6bc209eec8f4742f3a7 Mon Sep 17 00:00:00 2001 From: lancer Date: Mon, 20 May 2024 08:00:39 -0700 Subject: [PATCH 113/172] update the utils function and assert --- dev/cuda/matmul_backward_bias.cu | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 52d793ac7..16172bcf2 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -27,6 +27,26 @@ sudo ncu --set full --import-source yes -o bias -f ./matmul_backward_bias 1 #define ENABLE_BF16 #include "common.h" + +// ---------------------------------------------------------------------------- +// utility functions +__host__ __device__ bool isPowerOfTwo(int n) { + return (n > 0) && ((n & (n - 1)) == 0); +} + +__host__ __device__ int largestPowerOfTwoLessOrEqual(int n) { + // Return the largest power of 2 less than or equal to n + if (n < 1) { + return 0; + } + + while ((n & (n - 1)) > 0) { + n = n & (n - 1); + } + + return n; +} + // ---------------------------------------------------------------------------- // CPU code reference @@ -421,9 +441,8 @@ __global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, s // version1: simple cuBLAS calls void matmul_backward_bias1(floatX* dbias, const floatX* dout, int B, int T, int OC, int block_size) { - if (block_size == 768) { - block_size = 1024; // block_size needs to be power of 2 due to the reduction - } + block_size = largestPowerOfTwoLessOrEqual(block_size); + assert(isPowerOfTwo(block_size)); // block_size needs to be power of 2 due to the reduction dim3 block_dim(block_size); dim3 grid_dim(OC); size_t shared_mem_size = block_size * sizeof(float); From 722e5b2fe5a4a9cdcaf7041178737b2e14a91591 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 22:43:40 +0000 Subject: [PATCH 114/172] refactor how we treat datasets, because we're about to have more of them and we don't want them to clutter up root dir etc. this is only step 1, i'm about to refactor a bunch of the dataloading, how the .bin files work and are loaded, how the DataLoader works, etc. 
This is all needed to support good evals and training at scale --- .github/workflows/ci.yml | 30 ++++++++--------- README.md | 26 +++++++-------- dev/{ => data}/hellaswag.py | 21 ++---------- dev/{ => data}/mmlu.py | 21 ++---------- .../data/tinyshakespeare.py | 33 +++++-------------- .../data/tinystories.py | 33 ++++++------------- train_gpt2.c | 8 ++--- train_gpt2.cu | 4 +-- train_gpt2.py | 2 +- train_gpt2_fp32.cu | 4 +-- 10 files changed, 61 insertions(+), 121 deletions(-) rename dev/{ => data}/hellaswag.py (92%) rename dev/{ => data}/mmlu.py (90%) rename prepro_tinyshakespeare.py => dev/data/tinyshakespeare.py (70%) rename prepro_tinystories.py => dev/data/tinystories.py (82%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb19f2ba5..e4c211bc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v4 - name: Install OpenMP - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: | if [ "${{ runner.os }}" == "Linux" ]; then sudo apt-get update && sudo apt-get install -y libomp-dev @@ -33,7 +33,7 @@ jobs: run: pip install -r requirements.txt - name: Run preprocessing - run: python prepro_tinyshakespeare.py + run: python dev/data/tinyshakespeare.py - name: Train model run: python train_gpt2.py --device=cpu @@ -45,9 +45,9 @@ jobs: $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' $output = './make-bin-win64.zip' $wc.DownloadFile($url, $output) - + - name: Unzip Win32 Makefile - if: matrix.os == 'windows-latest' + if: matrix.os == 'windows-latest' run: | unzip make-bin-win64.zip @@ -59,26 +59,26 @@ jobs: if: matrix.os == 'windows-latest' shell: cmd run: | - call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" make-4.4.1\dist\make WIN_CI_BUILD=1 test_gpt2 train_gpt2 - name: Execute testing program (With OpenMP) - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: OMP_NUM_THREADS=8 ./test_gpt2 - - name: Execute Windows testing program (With OpenMP) - if: matrix.os == 'windows-latest' + - name: Execute Windows testing program (With OpenMP) + if: matrix.os == 'windows-latest' shell: cmd run: | copy test_gpt2 test_gpt2.exe - test_gpt2.exe + test_gpt2.exe - name: Compile training and testing program without OpenMP - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: NO_OMP=1 make test_gpt2 train_gpt2 - name: Execute testing program (No OpenMP) - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: ./test_gpt2 build-cuda-windows: @@ -93,11 +93,11 @@ jobs: $url = 'https://github.com/maweil/MakeForWindows/releases/download/v4.4.1/make-bin-win64.zip' $output = './make-bin-win64.zip' $wc.DownloadFile($url, $output) - + - name: Unzip Win32 Makefile run: | unzip make-bin-win64.zip - + - name: Install Cuda Toolkit 12.4 on Windows run: | mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" @@ -134,9 +134,9 @@ jobs: shell: cmd working-directory: ${{ github.workspace }} run: | - call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" + call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu - + build-cuda-fp32: runs-on: 
ubuntu-latest container: diff --git a/README.md b/README.md index 469326fbf..aee282fe0 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The "I don't care about anything I just want to train and I have a GPU" section. ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2fp32cu ./train_gpt2fp32cu @@ -22,17 +22,17 @@ The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent ## quick start (GPU, fast bleeding edge) -I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu` and also turn on flash attention. Run: +I want to see it go fast. In this case switch to our mainline, most optimized `train_gpt2.cu`. Run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu ./train_gpt2cu ``` -If you additionally install cuDNN (see the CUDA section below), you can also go faster with flash attention +If you additionally install cuDNN (see the CUDA section below), you can go even faster with flash attention. Adjust the make command as follows to compile with cudnn / flash attention: ```bash make train_gpt2cu USE_CUDNN=1 @@ -48,9 +48,9 @@ Note that the default batch size is very low (4). If you have enough memory on y My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: ```bash -python prepro_tinystories.py +python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i data/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 ``` Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. @@ -61,7 +61,7 @@ The "I am so GPU poor that I don't even have one" section. No worries, run: ```bash pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2 OMP_NUM_THREADS=8 ./train_gpt2 @@ -73,10 +73,10 @@ The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent You'll be using the (more bleeding edge) mixed precision version of the code: -``` +```bash sudo apt install openmpi-bin openmpi-doc libopenmpi-dev pip install -r requirements.txt -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py python train_gpt2.py make train_gpt2cu mpirun -np ./train_gpt2cu @@ -89,17 +89,17 @@ Sub in the number of GPUs you'd like to run on in the last command. Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: ```bash -python prepro_tinyshakespeare.py +python dev/data/tinyshakespeare.py ``` This prints: ``` -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +Saved 32768 tokens to (...)/tiny_shakespeare_val.bin +Saved 305260 tokens to (...)/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `prepro_tinystories.py`. 
+The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `tinystories.py`. In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: diff --git a/dev/hellaswag.py b/dev/data/hellaswag.py similarity index 92% rename from dev/hellaswag.py rename to dev/data/hellaswag.py index 361a7757a..a1c14f591 100644 --- a/dev/hellaswag.py +++ b/dev/data/hellaswag.py @@ -29,14 +29,14 @@ import requests import tiktoken from tqdm import tqdm - import torch import torch.nn as nn from torch.nn import functional as F - from transformers import GPT2LMHeadModel +from data_common import download_file -DATA_CACHE_DIR = os.path.join("data", "hellaswag") +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "hellaswag") hellaswags = { "train": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_train.jsonl", @@ -46,21 +46,6 @@ enc = tiktoken.get_encoding("gpt2") -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(split): """Downloads HellaSwag DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) diff --git a/dev/mmlu.py b/dev/data/mmlu.py similarity index 90% rename from dev/mmlu.py rename to dev/data/mmlu.py index b61fe9324..bda8855b8 100644 --- a/dev/mmlu.py +++ b/dev/data/mmlu.py @@ -15,33 +15,18 @@ import tiktoken import pandas as pd from tqdm import tqdm - import torch import torch.nn as nn from torch.nn import functional as F - from transformers import GPT2LMHeadModel +from data_common import download_file -DATA_CACHE_DIR = os.path.join("data", "mmlu") +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "mmlu") enc = tiktoken.get_encoding("gpt2") data_url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads MMLU to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) diff --git a/prepro_tinyshakespeare.py b/dev/data/tinyshakespeare.py similarity index 70% rename from prepro_tinyshakespeare.py rename to dev/data/tinyshakespeare.py index a5d562284..6d795aef7 100644 --- a/prepro_tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -3,11 +3,11 @@ - The 
download is from Github. - The tokenization is GPT-2 tokenizer with tiktoken -The output is written to a newly created data/ folder. +The output is written to a newly created tinyshakespeare/ folder. The script prints: -Saved 32768 tokens to data/tiny_shakespeare_val.bin -Saved 305260 tokens to data/tiny_shakespeare_train.bin +Saved 32768 tokens to tinyshakespeare/tiny_shakespeare_val.bin +Saved 305260 tokens to tinyshakespeare/tiny_shakespeare_train.bin And runs in a few seconds depending on your internet connection and computer. The .bin files are raw byte @@ -15,36 +15,20 @@ """ import os -import requests -from tqdm import tqdm - import tiktoken import numpy as np +from data_common import download_file + +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinyshakespeare") -DATA_CACHE_DIR = "data" enc = tiktoken.get_encoding("gpt2") encode = lambda s: enc.encode(s, allowed_special={'<|endoftext|>'}) -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads the TinyShakespeare dataset to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) - - # download the TinyStories dataset, unless it's already downloaded + # download the TinyShakespeare dataset, unless it's already downloaded data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") if not os.path.exists(data_filename): @@ -54,7 +38,6 @@ def download(): print(f"{data_filename} already exists, skipping download...") def tokenize(): - eot = enc._special_tokens['<|endoftext|>'] # end of text token data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") text = open(data_filename, 'r').read() # let's treat every person's statement in the dialog as a separate document diff --git a/prepro_tinystories.py b/dev/data/tinystories.py similarity index 82% rename from prepro_tinystories.py rename to dev/data/tinystories.py index 8f2c1e8ad..628e5a7bb 100644 --- a/prepro_tinystories.py +++ b/dev/data/tinystories.py @@ -3,13 +3,13 @@ - The download is from HuggingFace datasets. - The tokenization is GPT-2 tokenizer with tiktoken -The output is written to a newly created data/ folder. +The output is written to a newly created tinystories/ folder. The script prints: Tokenizing val split... -Saved 19043638 tokens to data/TinyStories_val.bin +Saved 19043638 tokens to tinystories/TinyStories_val.bin Tokenizing train split... -Saved 925653391 tokens to data/TinyStories_train.bin +Saved 925653391 tokens to tinystories/TinyStories_train.bin And runs in 1-2 minutes two depending on your internet connection and computer. 
The .bin files are raw byte @@ -23,29 +23,16 @@ import requests from tqdm import tqdm from concurrent.futures import ProcessPoolExecutor, as_completed - import tiktoken import numpy as np +from data_common import download_file + +# ----------------------------------------------------------------------------- +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinystories") -DATA_CACHE_DIR = "data" enc = tiktoken.get_encoding("gpt2") encode = lambda s: enc.encode_ordinary(s) -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - def download(): """Downloads the TinyStories dataset to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) @@ -70,11 +57,11 @@ def download(): # print a single example just for debugging and such shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) - with open(shard_filenames[0], "r") as f: - data = json.load(f) print("Download done.") print(f"Number of shards: {len(shard_filenames)}") - #print(f"Example story:\n{data[0]}") + # with open(shard_filenames[0], "r") as f: + # data = json.load(f) + # print(f"Example story:\n{data[0]}") def process_shard(shard_index, shard_filename): with open(shard_filename, "r") as f: diff --git a/train_gpt2.c b/train_gpt2.c index 95c46ab86..8fdf46e4b 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1100,10 +1100,10 @@ int main() { gpt2_build_from_checkpoint(&model, "gpt2_124M.bin"); // build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories - const char* tiny_stories_train = "data/TinyStories_train.bin"; - const char* tiny_stories_val = "data/TinyStories_val.bin"; - const char* tiny_shakespeare_train = "data/tiny_shakespeare_train.bin"; - const char* tiny_shakespeare_val = "data/tiny_shakespeare_val.bin"; + const char* tiny_stories_train = "dev/data/tinystories/TinyStories_train.bin"; + const char* tiny_stories_val = "dev/data/tinystories/TinyStories_val.bin"; + const char* tiny_shakespeare_train = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* tiny_shakespeare_val = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* train_tokens = access(tiny_shakespeare_train, F_OK) != -1 ? tiny_shakespeare_train : tiny_stories_train; const char* val_tokens = access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val; int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..0b69574e2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2620,7 +2620,7 @@ void error_usage() { // default run = debugging run with TinyShakespeare // bigger run = train on TinyStories! e.g. 
val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); + fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); @@ -2648,7 +2648,7 @@ int main(int argc, char *argv[]) { multi_gpu_config = multi_gpu_config_init(&argc, &argv); // read in the (optional) command line arguments - const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. data/TinyStories + const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size diff --git a/train_gpt2.py b/train_gpt2.py index ab1c3e44d..ceda8ae5e 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -394,7 +394,7 @@ def print0(*args, **kwargs): # if you'd like to e.g. time the forward pass only, call this script as: # python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 parser = argparse.ArgumentParser() - parser.add_argument("--input_bin", type=str, default="data/tiny_shakespeare_val.bin", help="input .bin to train on") + parser.add_argument("--input_bin", type=str, default="dev/data/tinyshakespeare/tiny_shakespeare_val.bin", help="input .bin to train on") parser.add_argument("--model", type=str, default="gpt2", help="gpt2|gpt2-medium|gpt2-large|gpt2-xl") parser.add_argument("--write_tensors", type=int, default=1, help="write tensors to disk") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index 178288f33..d2cf53b43 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1595,7 +1595,7 @@ void error_usage() { // default run = debugging run with TinyShakespeare // bigger run = train on TinyStories! e.g. val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2fp32cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2fp32cu -i data/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); + fprintf(stderr, "Example: ./train_gpt2fp32cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); @@ -1614,7 +1614,7 @@ void error_usage() { int main(int argc, char *argv[]) { // read in the (optional) command line arguments - const char* input_dataset_prefix = "data/tiny_shakespeare"; // or e.g. data/TinyStories + const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max From f671cf92880ec52e5408c27a6b5767e03b708345 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:02:41 +0000 Subject: [PATCH 115/172] more changes, trying to help people out because when this merges to master it will brick everyone's code... 
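The "help" here is a set of fail-fast checks with migration hints around the places where data files are opened. A minimal sketch of the pattern as it looks on the Python side (the helper name is illustrative; the paths are simply the new defaults):

```python
import os
import sys

def require_data_file(path: str) -> None:
    # fail fast with migration hints if a .bin data file is missing
    if not os.path.isfile(path):
        print(f"ERROR: input .bin file not found: {path}")
        print("---> HINT: the data files recently moved from data/ to dev/data/<dataset>/")
        print("---> HINT: re-run the prepro script, e.g. `python dev/data/tinyshakespeare.py`")
        sys.exit(1)

# e.g. the default validation split now lives here:
require_data_file("dev/data/tinyshakespeare/tiny_shakespeare_val.bin")
```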
--- train_gpt2.c | 3 +++ train_gpt2.cu | 2 +- train_gpt2.py | 6 +++++- utils.h | 3 ++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 8fdf46e4b..41ec3147e 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1021,6 +1021,9 @@ void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { loader->tokens_file = fopen(filename, "rb"); if (loader->tokens_file == NULL) { printf("Error opening tokens file\n"); + printf("--> HINT: the data directory may have moved recently from data/ to dev/data/(dataset)/"); + printf("--> HINT: refer again to the README file and possibly re-run the dataset prepro script."); + printf("--> HINT: example: re-run `python dev/data/tinyshakespeare.py`"); exit(1); } diff --git a/train_gpt2.cu b/train_gpt2.cu index 0b69574e2..6584a4cd5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2622,7 +2622,7 @@ void error_usage() { fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -i input dataset prefix (default = dev/data/tinyshakespeare/tiny_shakespeare)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); diff --git a/train_gpt2.py b/train_gpt2.py index ceda8ae5e..4d61e68cd 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -495,7 +495,11 @@ def print0(*args, **kwargs): # load the tokens # note we're using val by default instead of train split just because it is smaller/faster - assert os.path.isfile(args.input_bin) + if not os.path.isfile(args.input_bin): + print0(f"ERROR: input .bin file not found: {args.input_bin}") + print0("---> HINT: try to re-run the data prepro script. these recently moved to dev/data") + print0("---> HINT: for example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: tokens = np.frombuffer(f.read(), dtype=np.int32) diff --git a/utils.h b/utils.h index 5d594cb6c..a40de67e3 100644 --- a/utils.h +++ b/utils.h @@ -24,7 +24,8 @@ FILE *fopen_check(const char *path, const char *mode, const char *file, int line fprintf(stderr, " Line: %d\n", line); fprintf(stderr, " Path: %s\n", path); fprintf(stderr, " Mode: %s\n", mode); - fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n"); + fprintf(stderr, "---> HINT 1: dataset files/code have moved to dev/data recently (May 20, 2024). You may have to mv them from the legacy data/ dir to dev/data/(dataset), or re-run the data preprocessing script. 
Refer back to the main README\n"); + fprintf(stderr, "---> HINT 2: possibly try to re-run `python train_gpt2.py`\n"); exit(EXIT_FAILURE); } return fp; From 71774b3c3d83f651e143f04cc2ab79b9b757f2eb Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:10:49 +0000 Subject: [PATCH 116/172] oops forgot to include data_common.py --- dev/data/data_common.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 dev/data/data_common.py diff --git a/dev/data/data_common.py b/dev/data/data_common.py new file mode 100644 index 000000000..d6b71ecbc --- /dev/null +++ b/dev/data/data_common.py @@ -0,0 +1,21 @@ +""" +Common utilities for the datasets +""" + +import requests +from tqdm import tqdm + +def download_file(url: str, fname: str, chunk_size=1024): + """Helper function to download a file from a given url""" + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) From 7d11b7996c86b79095e6a9967010f761d0c6f363 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:13:42 +0000 Subject: [PATCH 117/172] i also forgot to include the readme file for the new dev/data dir --- dev/data/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 dev/data/README.md diff --git a/dev/data/README.md b/dev/data/README.md new file mode 100644 index 000000000..b13675c1e --- /dev/null +++ b/dev/data/README.md @@ -0,0 +1,8 @@ +# dev/data organization + +The idea is that each dataset has a .py file here in the root of `dev/data`, and each dataset then creates a directory here, and writes and caches anything inside that directory. So for example: + +- running `python tinystories.py` will create a directory `tinystories` with its .bin files inside it +- running `python tinyshakespeare.py` will create a directory `tinyshakespeare` with its .bin files inside it + +And so on. This way we can nicely organize multiple datasets here, share common utilities between them, and then point the .py/.c code in the root of the project accordingly to these. 
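Concretely, a new dataset script follows the same skeleton as tinyshakespeare.py: compute its own cache directory next to the script, download into it, tokenize, and write .bin files into it. A minimal sketch, where the dataset name and URL are placeholders:

```python
# dev/data/mynewdataset.py -- "mynewdataset" is a placeholder name
import os
import tiktoken
from data_common import download_file  # shared helper from dev/data/data_common.py

DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "mynewdataset")

def download():
    # download the raw text into this dataset's own cache directory
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    data_url = "https://example.com/mynewdataset.txt"  # placeholder URL
    data_filename = os.path.join(DATA_CACHE_DIR, "mynewdataset.txt")
    if not os.path.exists(data_filename):
        download_file(data_url, data_filename)

def tokenize():
    # GPT-2 tokenize the raw text; train/val .bin files then get written into DATA_CACHE_DIR
    enc = tiktoken.get_encoding("gpt2")
    with open(os.path.join(DATA_CACHE_DIR, "mynewdataset.txt"), "r") as f:
        tokens = enc.encode_ordinary(f.read())
    # ... split into train/val and write the .bin files here ...

if __name__ == "__main__":
    download()
    tokenize()
```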
From bced34d71e0d5006da71b62861de4c58e82a66ed Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:15:44 +0000 Subject: [PATCH 118/172] adjust gitignore --- .gitignore | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f60885e23..ba0de8b90 100644 --- a/.gitignore +++ b/.gitignore @@ -2,12 +2,18 @@ .vscode .venv -# data files -data - # .bin files generated by Python *.bin +# data directories +dev/data/__pycache__/ +dev/data/fineweb.py +dev/data/fineweb/ +dev/data/hellaswag/ +dev/data/mmlu/ +dev/data/tinyshakespeare/ +dev/data/tinystories/ + # binaries test_gpt2 test_gpt2cu @@ -22,6 +28,7 @@ dev/cuda/classifier_fused dev/cuda/adamw dev/cuda/matmul_backward_bias dev/cuda/nccl_all_reduce +dev/cuda/global_norm *.obj *.exe *.o From c51cd70cfbfd2ef6f8ef4016b7f1a58bd741b314 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 20 May 2024 23:22:31 +0000 Subject: [PATCH 119/172] fix a slip in gitignore, i think i am getting tired today --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index ba0de8b90..05391b6d1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,6 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb.py dev/data/fineweb/ dev/data/hellaswag/ dev/data/mmlu/ From 4bbd01fe120e8b45343154de1e2b7d886c2b0c1e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 00:16:07 +0000 Subject: [PATCH 120/172] add fineweb, and add the first version of a new write_shard function that contains a header properly --- dev/data/data_common.py | 22 +++++++++++ dev/data/fineweb.py | 83 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 dev/data/fineweb.py diff --git a/dev/data/data_common.py b/dev/data/data_common.py index d6b71ecbc..c3147de76 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -4,6 +4,8 @@ import requests from tqdm import tqdm +import numpy as np + def download_file(url: str, fname: str, chunk_size=1024): """Helper function to download a file from a given url""" @@ -19,3 +21,23 @@ def download_file(url: str, fname: str, chunk_size=1024): for data in resp.iter_content(chunk_size=chunk_size): size = file.write(data) bar.update(size) + + +def write_shard(filename, toks): + """Saves token data as a .bin file, for reading in C""" + assert len(toks) < 2**31, "token count too large" # ~2.1B tokens + # construct the header + header = np.zeros(256, dtype=np.int32) + header[0] = 20240520 # magic + header[1] = 1 # version + header[2] = len(toks) # number of tokens after the 256*4 bytes of header (each 2 bytes as uint16) + # validate that no token exceeds a uint16 + maxtok = 2**16 + assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" + # construct the tokens + toks_np = np.array(toks, dtype=np.uint16) + # write to file + print(f"writing {filename}") + with open(filename, "wb") as f: + f.write(header.tobytes()) + f.write(toks_np.tobytes()) diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py new file mode 100644 index 000000000..8b6ef4bfa --- /dev/null +++ b/dev/data/fineweb.py @@ -0,0 +1,83 @@ +""" +FineWeb dataset (for srs pretraining) +https://huggingface.co/datasets/HuggingFaceFW/fineweb + +example doc to highlight the structure of the dataset: +{ + "text": "Posted by mattsmith on 20th April 2012\nStraight from...", + "id": "", + "dump": "CC-MAIN-2013-20", + "url": "http://nleastchatter.com/philliesphandom/tag/freddy-galvis/", + "date": "2013-05-18T07:24:47Z", 
+ "file_path": "s3://commoncrawl/long.../path.../file.gz", + "language": "en", + "language_score": 0.9185474514961243, + "token_count": 594 +} +""" +import os +import argparse +import multiprocessing as mp +import numpy as np +import tiktoken +# from huggingface_hub import snapshot_download +from datasets import load_dataset +from tqdm import tqdm +import argparse + +from data_common import write_shard +# ------------------------------------------ + +parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing") +parser.add_argument("-s", "--shard_size", type=int, default=10**9, help="Size of each shard in tokens") +args = parser.parse_args() + +# create the cache directory if it doesn't exist yet +DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "fineweb10B") +os.makedirs(DATA_CACHE_DIR, exist_ok=True) + +# todo is this needed? or just the load_dataset below? +# download 10B Tokens sample (~28GB on disk) +# folder = snapshot_download( +# "HuggingFaceFW/fineweb", +# repo_type="dataset", +# local_dir="./data/fineweb/", +# allow_patterns="sample/10BT/*" +# ) +fw = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train") + +# init the tokenizer +enc = tiktoken.get_encoding("gpt2") +eot = enc._special_tokens['<|endoftext|>'] # end of text token + +# helper functions +def tokenize(doc): + return enc.encode_ordinary(doc["text"]) + +# main loop write files +pool = mp.Pool() +shard_index = 0 +all_tokens = [] +progress_bar = None +for tokens in pool.imap(tokenize, fw): + + # record the tokens and make sure to separate documents + all_tokens.append(eot) + all_tokens.extend(tokens) + + # update progress bar + if progress_bar is None: + progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") + progress_bar.update(len(tokens)) + + # if we reach shard_size tokens, write shard to disk + if len(all_tokens) >= args.shard_size: + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") + write_tokens = all_tokens[:args.shard_size] + rest_tokens = all_tokens[args.shard_size:] + write_shard(filename, write_tokens) + shard_index += 1 + progress_bar = None + # note: create a copy so Python can free the all_tokens memory above + # the list rest_tokens is expected to be very small + all_tokens = [t for t in rest_tokens] From b5e75dde8e8d20b13177b060f3ed364bbe50eb12 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 15:57:07 +0100 Subject: [PATCH 121/172] Fully deterministic encoder backward kernels for train_gpt2.cu --- profile_gpt2.cu | 2 +- test_gpt2.cu | 2 +- train_gpt2.cu | 245 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 189 insertions(+), 60 deletions(-) diff --git a/profile_gpt2.cu b/profile_gpt2.cu index f2ac0e84c..f79e9ada4 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { // do a training step gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model); + gpt2_backward(&model, x); gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings diff --git a/test_gpt2.cu b/test_gpt2.cu index 50a291f18..d06734507 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -203,7 +203,7 @@ int main(int argc, char *argv[]) { clock_gettime(CLOCK_MONOTONIC, &start); gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model); + gpt2_backward(&model, x); clock_gettime(CLOCK_MONOTONIC, &end); double 
time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..899293f75 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -38,6 +38,9 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include #include #include +#include +#include +#include // GPU / CUDA related #include #include @@ -532,50 +535,108 @@ __global__ void encoder_forward_kernel3(floatX* out, store128(out_btc, packed_out); } -template -__device__ void atomicStochasticAdd(T* address, float val0, float val1, unsigned int seed) { - static_assert(sizeof(T) == 2, "Only 16-bit atomicStochasticAdd supported."); - float2 val = make_float2(val0, val1); - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; - unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - do { - assumed = old; - float2 new_fp32 = make_float2((float)(reinterpret_cast(&old)[0]) + val.x, - (float)(reinterpret_cast(&old)[1]) + val.y); - T new_rounded[2]; - stochastic_rounding(new_fp32.x, &new_rounded[0], random); - stochastic_rounding(new_fp32.y, &new_rounded[1], random >> 16); - old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_rounded); - } while (assumed != old); -} -__device__ void atomicStochasticAdd(float* address, float val0, float val1, unsigned int seed) { - atomicAdd(address, val0); - atomicAdd(address + 1, val1); -} - -__global__ void encoder_backward_kernel(floatX* dwte, floatX* dwpe, - const floatX* dout, const int* inp, - int B, int T, int C, unsigned int seed) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int N = B * T * C; - idx *= 2; // 2 elements per thread - if (idx >= N) { return; } +template +__global__ void wte_backward_kernel(floatX* dwte, + const int4* bucket_info, const int* workload_indices, const floatX* dout, const int* inp, + unsigned int seed, int B, int T, int C) { + // In order to be deterministic, we preprocess the inputs on the cpu into "buckets" + // Each bucket corresponds to (WARP_SIZE * x128::size) channels for a single vocabulary token + // Each thread handles x128::size channels, e.g. 256 per warp for BF16 + // Each block handles (BLOCK_SIZE / WARP_SIZE) elements in a single bucket in parallel + // If a bucket has less than 8 elements, some warps will return immediately + // If a bucket has more than 8 elements, we will loop over all of them + // The buckets are sorted on the CPU so the largest buckets start 1st + int bucket = blockIdx.x; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int c_per_warp = WARP_SIZE * x128::size; + + int bucket_start_idx = bucket_info[bucket].x; + int bucket_size = bucket_info[bucket].y; + int bucket_ix = bucket_info[bucket].z; + int c = bucket_info[bucket].w * c_per_warp + (lane_id * x128::size); + + // Each thread handles "x128::size" channels, so at fp8, each warp would handle 512 channels + // If C is not a multiple of this (e.g. 
768), some buckets/c_groups cannot use the entire warp + if (c >= C) { return; } + // Exit early if this is a small bucket and this warp doesn't have any items to process + if (warp_id >= bucket_size) { return; } + + float accum[x128::size] = {0.0f}; + __shared__ float accum_shared[x128::size * BLOCK_SIZE]; + + for(int item = warp_id; item < bucket_size; item += BLOCK_SIZE/WARP_SIZE) { + int bt = workload_indices[bucket_start_idx + item]; + int b = bt / T; + int t = bt % T; + + const floatX* dout_btc = dout + b * T * C + t * C + c; + x128 packed_inp1 = load128cs(dout_btc); + for (int k = 0; k < packed_inp1.size; k++) { + accum[k] += (float)packed_inp1[k]; + } + } - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; + if (warp_id != 0) { + // we accumulate into warp 0, so only the other warps need to write to shared memory + for (int k = 0; k < x128::size; k++) { + accum_shared[threadIdx.x + k * BLOCK_SIZE] = accum[k]; + } + return; // only warp 0 is needed after writing to shared memory + } - int ix = inp[b * T + t]; + // Read dwte for warp 0 even if other warps are not finished yet to maximise latency tolerance + floatX* dwte_ix = dwte + bucket_ix * C + c; + x128 packed_in_out = load128(dwte_ix); - const floatX* dout_btc = dout + b * T * C + t * C + c; - floatX* dwte_ix = dwte + ix * C + c; - floatX* dwpe_tc = dwpe + t * C + c; + // note: threads which have returned are considered synchronised by CUDA so no risk of deadlock + __syncthreads(); - float2 dout_data = make_float2(dout_btc[0], dout_btc[1]); - atomicStochasticAdd(dwte_ix, dout_data.x, dout_data.y, seed); - atomicStochasticAdd(dwpe_tc, dout_data.x, dout_data.y, seed ^ 0xFFFFFFFF); + // Accumulate into warp 0's registers by reading the values of the other warps in shared memory + for (int i = threadIdx.x+WARP_SIZE; i < min(BLOCK_SIZE, bucket_size*WARP_SIZE); i += WARP_SIZE) { + for (int k = 0; k < x128::size; k++) { + accum[k] += accum_shared[i + k * BLOCK_SIZE]; + } + } + + // Add the result to dwte and write back to global memory (read-modify-write) + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic + stochastic_rounding(accum[k] + (float)packed_in_out[k], &packed_in_out[k], seed + k); + } + store128(dwte_ix, packed_in_out); +} + +__global__ void wpe_backward_kernel(floatX* dwpe, + const floatX* dout, const int* inp, + int B, int T, int C, unsigned int seed) { + // Each thread handles x128::size "channel positions", e.g. 256 per warp for BF16 + // For gpt2-124M BF16, C=768 and T=1024, so 3 warps per channel and 3072 warps in total + // For each "channel position" we sum the gradients for every batch at that C/T element + // This way each dwte element is only updated once, and the kernel is fully deterministic! 
+ // The previous kernel was not deterministic, as batches were aggregated with atomicAdd + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= T * C) { return; } + + // if C is not a multiple of WARP_SIZE*x128::size, it's OK for some warps to handle multiple t + int t = idx / C; + int c = idx % C; + float accum[x128::size] = {0.0f}; + + for (int b = 0; b < B; b++) { + x128 packed_dout = load128cs(dout + (b * T * C) + (t * C) + c); // will never be read again + for (int k = 0; k < x128::size; k++) { + accum[k] += (float)packed_dout[k]; + } + } + + floatX* dwpe_tc = dwpe + (t * C) + c; + x128 packed_dwpe = load128(dwpe_tc); + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic + stochastic_rounding(accum[k] + (float)packed_dwpe[k], &packed_dwpe[k], seed + k); + } + store128(dwpe_tc, packed_dwpe); } __global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, @@ -783,10 +844,9 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons // directly autoregressive, so we only compute the lower triangular part // uses the online softmax algorithm assert(T % 4 == 0); - const int warp_size = 32; - int lane_id = threadIdx.x % warp_size; - int warp_id = threadIdx.x / warp_size; - int num_warps = blockDim.x / warp_size; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; // micro-optimization: we iterate backwards so that // after the softmax backward operation completes, the cache retains the @@ -809,7 +869,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons float sumval = 0.0f; const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); - for (int i = lane_id; i < pos_by_4; i += warp_size) { + for (int i = lane_id; i < pos_by_4; i += WARP_SIZE) { float regarray[4]; for (int k = 0; k < 4; ++k) { regarray[k] = (float)x_aligned[4*i + k]; @@ -838,7 +898,7 @@ __global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, cons float norm = 1.f / sum; // divide the whole row by the sum - for (int i = lane_id; i <= own_pos; i += warp_size) { + for (int i = lane_id; i <= own_pos; i += WARP_SIZE) { // recalculation is faster than doing the round-trip through memory. 
float ev = expf(inv_temperature * ((float)__ldcs(x + i) - global_maxval)); __stcs(out + idx * T + i, (floatX)(ev * norm)); @@ -1354,14 +1414,70 @@ void encoder_forward(floatX* out, cudaCheck(cudaGetLastError()); } -void encoder_backward(floatX* dwte, floatX* dwpe, - const floatX* dout, const int* inp, - int B, int T, int C, unsigned int seed) { +// Fully deterministic (see comments in wte_backward_kernel and wpe_backward_kernel for more details) +void encoder_backward(floatX* dwte, floatX* dwpe, floatX* scratch, // gpu outputs & scratch + int* workload_indices, int4* bucket_info, // cpu scratch buffers + const floatX* dout, const int* inp, const int* inputs_cpu, // cpu/gpu inputs + int B, int T, int C, unsigned int seed) { NVTX_RANGE_FN(); - const int N = B * T * C; + + // Launch wpe kernel first (so it runs on the GPU in parallel with the CPU pre-processing for wte) const int block_size = 256; - const int grid_size = CEIL_DIV(N, block_size * 2); // each thread handles 2 elements - encoder_backward_kernel<<>>(dwte, dwpe, dout, inp, B, T, C, seed); + const int N = T * C / x128::size; + const int grid_size = CEIL_DIV(N, block_size); + wpe_backward_kernel<<>>(dwpe, dout, inp, B, T, C, seed); + + // check the GPU scratch buffer is large enough to hold the bucket info and workload indices + // todo - this is trivially true given hardcoded scratch buffer size here, is this useful? + int num_c_groups = CEIL_DIV(C, x128::size * WARP_SIZE); + assert(B*T*num_c_groups * (sizeof(int4)+sizeof(int)) <= B*T*3*C * sizeof(floatX)); + + // Step 1: Sort inputs into buckets + int total_items = 0; + std::unordered_map> buckets; + for (uint64_t bt = 0; bt < B * T; bt++) { + for (uint64_t c_group = 0; c_group < num_c_groups; c_group++) { + // todo - passing c_group/inputs_cpu[bt] in data to avoid a second hash lookup is a bit hacky + uint64_t data = bt + (c_group<<32ULL) + ((uint64_t)inputs_cpu[bt]<<42ULL); + buckets[c_group + num_c_groups * inputs_cpu[bt]].push_back(data); + total_items++; + } + } + + // Step 2: Sort buckets by size in descending order + // this is so the largest buckets are processed first by the GPU + // otherwise, if they started late, they would still be running with the rest of the GPU idle + std::vector>> sortedBuckets(buckets.begin(), buckets.end()); + std::sort(sortedBuckets.begin(), sortedBuckets.end(), // ugly because we don't have a typedef for the std::pair + [](const std::pair>& a, const std::pair>& b) { + return a.second.size() > b.second.size(); + }); + + int num_buckets = buckets.size(); + int bucket_index = 0; + int workload_index = 0; + for (const auto& bucket : sortedBuckets) { + bucket_info[bucket_index].x = workload_index; // bucket start + bucket_info[bucket_index].y = bucket.second.size(); // bucket size + bucket_info[bucket_index].z = (bucket.second[0] >> 42ULL) & ((1ULL<<20ULL)-1); // bucket ix + bucket_info[bucket_index].w = (bucket.second[0] >> 32ULL) & ((1ULL<<10ULL)-1); // bucket c + + for (uint64_t idx : bucket.second) { + workload_indices[workload_index++] = (int)(idx & ((1ULL<<31ULL)-1ULL)); + } + bucket_index++; + } + + // Step 3: Copy data from host to device (async until the last one to avoid synchronising CPU/GPU twice) + // todo - could use CUDA events (even without streams) to avoid CPU/GPU synchronisation completely + int4* d_bucket_info = (int4*)scratch; + int* d_workload_indices = (int*)(scratch + B*T*num_c_groups * sizeof(int4)); + cudaMemcpyAsync(d_bucket_info, bucket_info, num_buckets * sizeof(int4), cudaMemcpyHostToDevice); + 
cudaMemcpy(d_workload_indices, workload_indices, total_items * sizeof(int), cudaMemcpyHostToDevice); + + // Launch wte kernel + // todo - profile block sizes on more content (depends on number of buckets and on GPU?) + wte_backward_kernel<256><<>>(dwte, d_bucket_info, d_workload_indices, dout, inp, seed, B, T, C); cudaCheck(cudaGetLastError()); } @@ -1947,6 +2063,9 @@ typedef struct { unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. int use_master_weights; int recompute; + // todo - if other functions need cpu scratch buffers in the future, reuse as generic scratch? + int* workload_indices; // encoder_backward, B*T*num_c_groups (int) + int4* bucket_info; // encoder_backward, B*T*num_c_groups (int4) - size for worst case } GPT2; void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { @@ -2022,6 +2141,8 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->inputs = NULL; model->targets = NULL; model->cpu_losses = NULL; + model->workload_indices = NULL; + model->bucket_info = NULL; model->batch_size = 0; model->seq_len = 0; model->mean_loss = -1.0f; // -1.0f will designate no loss @@ -2195,7 +2316,7 @@ void gpt2_zero_grad(GPT2 *model) { } } -void gpt2_backward(GPT2 *model) { +void gpt2_backward(GPT2 *model, int* inputs) { NVTX_RANGE_FN(); // double check we forwarded previously, with targets if (model->mean_loss == -1.0f) { @@ -2221,6 +2342,11 @@ void gpt2_backward(GPT2 *model) { model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); // init gradients of parameters and activations to zero gpt2_zero_grad(model); + // initialise cpu scratch buffers for encoder backward + size_t num_c_groups = model->config.channels / (WARP_SIZE * x128::size); + assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?) 
+ model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups); + model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups); } // convenience shortcuts, size_t instead of int so that pointer arithmetics don't overflow @@ -2241,7 +2367,8 @@ void gpt2_backward(GPT2 *model) { cudaCheck(cudaMemset(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX))); // re-use the output buffer of the forward pass as a scratchpad during backward pass - float* scratchF = (float*)acts.output; + float* scratchF = (float*)acts.output; + floatX* scratchX = (floatX*)acts.output; // we kick off the chain rule by filling in dlosses with 1.0f/(B*T) // this was done in the fused classifier kernel as last step of forward pass @@ -2323,7 +2450,6 @@ void gpt2_backward(GPT2 *model) { floatX* buffer_a = l_atty; floatX* buffer_b = l_fch; // this is B x T x 4C, so even larger than what we need floatX* dl_preatt = (floatX*)grads_acts.preatt; // dedicated scratchpad allocation - floatX* scratchX = (floatX*)acts.output; attention_backward(dl_bt4c, buffer_b, dl_preatt, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH); #endif @@ -2332,7 +2458,8 @@ void gpt2_backward(GPT2 *model) { // layernorm backward does += to dresidual, so it correctly accumulates gradient for the Attention block above layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); } - encoder_backward(grads.wte, grads.wpe, dresidual, model->inputs, B, T, C, random_u32(&model->rng_state)); + encoder_backward(grads.wte, grads.wpe, scratchX, model->workload_indices, model->bucket_info, + dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state)); } // Compute a mean of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. @@ -2448,6 +2575,8 @@ void gpt2_free(GPT2 *model) { cudaCheck(cudaFree(model->inputs)); cudaCheck(cudaFree(model->targets)); cudaFreeHost(model->cpu_losses); + free(model->workload_indices); + free(model->bucket_info); } // ---------------------------------------------------------------------------- @@ -2477,7 +2606,7 @@ void common_free(GPT2 &model) { cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); cublasCheck(cublasLtDestroy(cublaslt_handle)); - create_cudnn(); + destroy_cudnn(); } #ifndef TESTING @@ -2880,7 +3009,7 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward // backward pass. 
all model params accumulate gradients with += inside this inner loop - gpt2_backward(&model); + gpt2_backward(&model, train_loader.inputs); } // override the mean loss, accounting for the gradient accumulation loop // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced From f7cbb303168de1b21aa8d9c8814e3a4a9237fae3 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 15:32:46 +0000 Subject: [PATCH 122/172] step 2 of dataloader refactor: separate out the dataloader to its own file, change its signature a little bit, and (notably) change from cudaMallocHost to a simple malloc, so that this file doesn't have to be cuda aware --- dataloader.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++ train_gpt2.cu | 86 +++--------------------------------------------- 2 files changed, 96 insertions(+), 81 deletions(-) create mode 100644 dataloader.h diff --git a/dataloader.h b/dataloader.h new file mode 100644 index 000000000..4ca0239fa --- /dev/null +++ b/dataloader.h @@ -0,0 +1,91 @@ +/* +Implements a medium simple DataLoader for a distributed training setup. +*/ + +#include +#include +#include +#include +// defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck +// defines: mallocCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// Distributed Data Loader + +typedef struct { + // Distributed data parallel specifics. + // Each worker loads it's own chunk of data. + int process_rank; + int num_processes; + // hyperparameters. use size_t to prevent overflow + size_t B; + size_t T; + // input handling and its state + FILE* tokens_file; + long file_size; + long current_position; + // outputs + int* batch; + int* inputs; + int* targets; + // convenience variables + size_t num_batches; +} DataLoader; + +void dataloader_init(DataLoader *loader, + const char* filename, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + + // open the input file for reading + loader->tokens_file = fopenCheck(filename, "rb"); + + // determine the file size + fseekCheck(loader->tokens_file, 0, SEEK_END); + loader->file_size = ftell(loader->tokens_file); + fseekCheck(loader->tokens_file, 0, SEEK_SET); + if (loader->file_size < (B * T + 1) * sizeof(int)) { + printf("Error: file size is too small for the batch size and sequence length\n"); + exit(EXIT_FAILURE); + } + loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning + + // allocate space for B*T + 1 integers to store the inputs and targets + loader->batch = (int*)malloc((B * T + 1) * sizeof(int)); + loader->inputs = loader->batch; + loader->targets = loader->batch + 1; // targets are shifted by one + // note: we definitely want to advance by B * T; That is the "stride" by which we move + // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 + loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); +} + +void dataloader_reset(DataLoader *loader) { + loader->current_position = 0; +} + +void dataloader_next_batch(DataLoader *loader) { + size_t B = loader->B; + size_t T = loader->T; + // if we are at the end of the file, loop back to the beginning + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { + loader->current_position = loader->process_rank * B * T * sizeof(int); + } + // read the B*T+1 integers from the file into batch + fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); + freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); + // advance the current position by B*T*num_processes integers + // note: the "stride" of tokens by which we move each time is definitely B * T + loader->current_position += loader->num_processes * B * T * sizeof(int); +} + +void dataloader_free(DataLoader *loader) { + free(loader->batch); + fcloseCheck(loader->tokens_file); +} diff --git a/train_gpt2.cu b/train_gpt2.cu index 6584a4cd5..f3dfca5c2 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -54,6 +54,8 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // CUDA precision settings @@ -2481,85 +2483,7 @@ void common_free(GPT2 &model) { } #ifndef TESTING -// if we are TESTING (see test_gpt2.cu), we'll skip the int main below -// ---------------------------------------------------------------------------- -// data loader lite: returns random batches of data from a file of integers - -typedef struct { - // Distributed data parallel specifics. - // Each worker loads it's own chunk of data. - int process_rank; - int num_processes; - // hyperparameters. 
use size_t to prevent overflow - size_t B; - size_t T; - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - size_t num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const MultiGpuConfig* multi_gpu_config, const char* filename, size_t B, size_t T) { - loader->process_rank = multi_gpu_config->process_rank; - loader->num_processes = multi_gpu_config->num_processes; - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopenCheck(filename, "rb"); - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(EXIT_FAILURE); - } - loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - // Using CUDA CPU pinned memory for faster PCI Express transfers to GPU - // See: https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/ - cudaMallocHost((void**)&loader->batch, (B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - // note: we definitely want to advance by B * T; That is the "stride" by which we move - // the window of tokens. We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - NVTX_RANGE_FN(); - size_t B = loader->B; - size_t T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { - loader->current_position = loader->process_rank * B * T * sizeof(int); - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T*num_processes integers - // note: the "stride" of tokens by which we move each time is definitely B * T - loader->current_position += loader->num_processes * B * T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - cudaFreeHost(loader->batch); -} - +// if we are TESTING (see test_gpt2.cu), we'll skip everything below this point // ---------------------------------------------------------------------------- // sampler: takes probabilities and samples integers from them @@ -2747,8 +2671,8 @@ int main(int argc, char *argv[]) { sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, &multi_gpu_config, train_tokens_filename, B, T); - dataloader_init(&val_loader, &multi_gpu_config, val_tokens_filename, B, T); + dataloader_init(&train_loader, train_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&val_loader, 
val_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf0("| train_num_batches | %-50d |\n", train_num_batches); From a3801f01efae3434d6e4cdbef3dd455fcc10404f Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 16:53:11 +0100 Subject: [PATCH 123/172] added algorithm header for std::sort on windows (not sure about compile time impact...) --- train_gpt2.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/train_gpt2.cu b/train_gpt2.cu index 899293f75..16f8a4216 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -39,6 +39,7 @@ This reads & runs in fp32, B=4, T=64, LR=1e-4, val/sample never (200), #include #include #include +#include #include #include // GPU / CUDA related From 1defbd4a19e4fcfe356175a4aac7dd01f6c2e56a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:05:27 +0000 Subject: [PATCH 124/172] adjust the dataloader to load the new .bin data files, and both prod datasets to use it instead --- dataloader.h | 72 +++++++++++++++++++++++++------------ dev/data/data_common.py | 10 ++++-- dev/data/fineweb.py | 4 +-- dev/data/tinyshakespeare.py | 11 ++---- dev/data/tinystories.py | 6 ++-- 5 files changed, 63 insertions(+), 40 deletions(-) diff --git a/dataloader.h b/dataloader.h index 4ca0239fa..fa4e62adc 100644 --- a/dataloader.h +++ b/dataloader.h @@ -12,6 +12,7 @@ Implements a medium simple DataLoader for a distributed training setup. // ---------------------------------------------------------------------------- // Distributed Data Loader +#define HEADER_SIZE 256 typedef struct { // Distributed data parallel specifics. 
@@ -26,13 +27,20 @@ typedef struct { long file_size; long current_position; // outputs - int* batch; - int* inputs; - int* targets; + uint16_t* buffer; // used to fread data from file into + int* inputs; // input tokens into transformer + int* targets; // target tokens for the transformer // convenience variables size_t num_batches; } DataLoader; +void dataloader_reset(DataLoader *loader) { + // each process starts at a different offset in the file + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + void dataloader_init(DataLoader *loader, const char* filename, size_t B, @@ -46,46 +54,64 @@ void dataloader_init(DataLoader *loader, // open the input file for reading loader->tokens_file = fopenCheck(filename, "rb"); + // validate the header + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); + if (header[0] != 20240520) { printf("Bad magic in data file\n"); exit(EXIT_FAILURE); } + if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } + long ntok = header[2]; // number of tokens in the file - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); + // determine the file size and make sure it is consistent with the number of tokens + fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file + loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. file size + fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning + // we expect ntok in the file to be consistent with filesize, assert that is the case + long expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); + if (loader->file_size != expected_file_size) { + printf("Error: file size is not as expected\n"); + exit(EXIT_FAILURE); + } + if (ntok < num_processes * B * T + 1) { + // being too defensive/lazy, we could tolerate as low as T+1 tokens in principle + printf("Error: there are too few tokens\n"); exit(EXIT_FAILURE); } - loader->current_position = loader->process_rank * B * T * sizeof(int); // start at the beginning // allocate space for B*T + 1 integers to store the inputs and targets - loader->batch = (int*)malloc((B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one + loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int*)malloc(B * T * sizeof(int)); + loader->targets = (int*)malloc(B * T * sizeof(int)); // note: we definitely want to advance by B * T; That is the "stride" by which we move // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = loader->file_size / (loader->num_processes * B * T * sizeof(int)); -} + loader->num_batches = ntok / (num_processes * B * T); -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; + // reset the loader to the beginning of the file + dataloader_reset(loader); } void dataloader_next_batch(DataLoader *loader) { size_t B = loader->B; size_t T = loader->T; // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(int) > loader->file_size) { - loader->current_position = loader->process_rank * B * T * sizeof(int); + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_reset(loader); } - // read the B*T+1 integers from the file into batch + // read B*T+1 uint16_t tokens from the file into buffer fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); + freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); + // decode the buffer into inputs and targets (cast to int) + for (int i = 0; i < B*T; i++) { + loader->inputs[i] = (int)loader->buffer[i]; + loader->targets[i] = (int)loader->buffer[i+1]; + } // advance the current position by B*T*num_processes integers // note: the "stride" of tokens by which we move each time is definitely B * T - loader->current_position += loader->num_processes * B * T * sizeof(int); + loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); } void dataloader_free(DataLoader *loader) { - free(loader->batch); + free(loader->buffer); + free(loader->inputs); + free(loader->targets); fcloseCheck(loader->tokens_file); } diff --git a/dev/data/data_common.py b/dev/data/data_common.py index c3147de76..ec85cb90b 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -23,8 +23,12 @@ def download_file(url: str, fname: str, chunk_size=1024): bar.update(size) -def write_shard(filename, toks): - """Saves token data as a .bin file, for reading in C""" +def write_datafile(filename, toks): + """ + Saves token data as a .bin file, for reading in C. 
+ - First comes a header with 256 int32s + - The tokens follow, each as a uint16 + """ assert len(toks) < 2**31, "token count too large" # ~2.1B tokens # construct the header header = np.zeros(256, dtype=np.int32) @@ -37,7 +41,7 @@ def write_shard(filename, toks): # construct the tokens toks_np = np.array(toks, dtype=np.uint16) # write to file - print(f"writing {filename}") + print(f"writing {len(toks):,} tokens to {filename}") with open(filename, "wb") as f: f.write(header.tobytes()) f.write(toks_np.tobytes()) diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 8b6ef4bfa..41091ba6a 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -25,7 +25,7 @@ from tqdm import tqdm import argparse -from data_common import write_shard +from data_common import write_datafile # ------------------------------------------ parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing") @@ -75,7 +75,7 @@ def tokenize(doc): filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") write_tokens = all_tokens[:args.shard_size] rest_tokens = all_tokens[args.shard_size:] - write_shard(filename, write_tokens) + write_datafile(filename, write_tokens) shard_index += 1 progress_bar = None # note: create a copy so Python can free the all_tokens memory above diff --git a/dev/data/tinyshakespeare.py b/dev/data/tinyshakespeare.py index 6d795aef7..6b7cbb976 100644 --- a/dev/data/tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -17,7 +17,7 @@ import os import tiktoken import numpy as np -from data_common import download_file +from data_common import download_file, write_datafile # ----------------------------------------------------------------------------- DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinyshakespeare") @@ -52,13 +52,8 @@ def tokenize(): # save to file val_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_val.bin") train_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_train.bin") - with open(val_filename, "wb") as f: - f.write(val_tokens_np.tobytes()) - with open(train_filename, "wb") as f: - f.write(train_tokens_np.tobytes()) - # prints - print(f"Saved {len(val_tokens_np)} tokens to {val_filename}") - print(f"Saved {len(train_tokens_np)} tokens to {train_filename}") + write_datafile(val_filename, val_tokens_np) + write_datafile(train_filename, train_tokens_np) if __name__ == "__main__": download() diff --git a/dev/data/tinystories.py b/dev/data/tinystories.py index 628e5a7bb..83621e1d8 100644 --- a/dev/data/tinystories.py +++ b/dev/data/tinystories.py @@ -25,7 +25,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed import tiktoken import numpy as np -from data_common import download_file +from data_common import download_file, write_datafile # ----------------------------------------------------------------------------- DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinystories") @@ -96,9 +96,7 @@ def tokenize(): all_tokens_np = np.array(all_tokens, dtype=np.int32) split_filename = os.path.join(DATA_CACHE_DIR, f"TinyStories_{split_name}.bin") - with open(split_filename, "wb") as f: - f.write(all_tokens_np.tobytes()) - print(f"Saved {len(all_tokens_np)} tokens to {split_filename}") + write_datafile(split_filename, all_tokens_np) if __name__ == "__main__": download() From 666145e7b8ef033c4a7eeda6392d759235f64afb Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:30:56 +0000 Subject: [PATCH 125/172] also fix the .c file and fp32 file --- train_gpt2.c | 84 
+++------------------------------------------- train_gpt2_fp32.cu | 78 +++--------------------------------------- 2 files changed, 10 insertions(+), 152 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 41ec3147e..57296736b 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -25,6 +25,8 @@ There will be other versions of this code that specialize it and make it fast. #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // all the individual layers' forward and backward passes @@ -992,81 +994,6 @@ void gpt2_free(GPT2 *model) { #ifndef TESTING // if we are TESTING (see test_gpt2.c), we'll skip the int main below - -// ---------------------------------------------------------------------------- -// data loader lite -// returns random batches of data from a file of integers - -typedef struct { - // hyperparameters - int B; // batch size - int T; // sequence length - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - int num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopen(filename, "rb"); - if (loader->tokens_file == NULL) { - printf("Error opening tokens file\n"); - printf("--> HINT: the data directory may have moved recently from data/ to dev/data/(dataset)/"); - printf("--> HINT: refer again to the README file and possibly re-run the dataset prepro script."); - printf("--> HINT: example: re-run `python dev/data/tinyshakespeare.py`"); - exit(1); - } - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(1); - } - loader->current_position = 0; // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - loader->batch = (int*) mallocCheck((B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - loader->num_batches = loader->file_size / (B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - int B = loader->B; - int T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (B*T+1) * sizeof(int) > loader->file_size) { - loader->current_position = 0; - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T integers - loader->current_position += B*T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - free(loader->batch); -} - // ---------------------------------------------------------------------------- // sampler @@ -1111,11 +1038,10 @@ int main() { const char* val_tokens = 
access(tiny_shakespeare_val, F_OK) != -1 ? tiny_shakespeare_val : tiny_stories_val; int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) int T = 64; // sequence length 64 (i.e. each sequence is 64 tokens long). must be <= maxT, which is 1024 for GPT-2 - DataLoader train_loader; - dataloader_init(&train_loader, train_tokens, B, T); + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_tokens, B, T, 0, 1); + dataloader_init(&val_loader, val_tokens, B, T, 0, 1); printf("train dataset num_batches: %d\n", train_loader.num_batches); - DataLoader val_loader; - dataloader_init(&val_loader, val_tokens, B, T); printf("val dataset num_batches: %d\n", val_loader.num_batches); int val_num_batches = 5; diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index d2cf53b43..9a2dc6bb7 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -31,6 +31,8 @@ the layernorms are connected to the residuals so we += in layernorm backward. #include "utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "tokenizer.h" +// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free +#include "dataloader.h" // ---------------------------------------------------------------------------- // CUDA utils @@ -1453,75 +1455,6 @@ void gpt2_free(GPT2 *model) { #ifndef TESTING // if we are TESTING (see test_gpt2.cu), we'll skip the int main below - -// ---------------------------------------------------------------------------- -// data loader lite: returns random batches of data from a file of integers - -typedef struct { - // hyperparameters - int B; - int T; - // input handling and its state - FILE* tokens_file; - long file_size; - long current_position; - // output memory - int* batch; - int* inputs; - int* targets; - // convenience variables - long num_batches; -} DataLoader; - -void dataloader_init(DataLoader *loader, const char* filename, int B, int T) { - loader->B = B; - loader->T = T; - - // open the input file for reading - loader->tokens_file = fopenCheck(filename, "rb"); - - // determine the file size - fseekCheck(loader->tokens_file, 0, SEEK_END); - loader->file_size = ftell(loader->tokens_file); - fseekCheck(loader->tokens_file, 0, SEEK_SET); - if (loader->file_size < (B * T + 1) * sizeof(int)) { - printf("Error: file size is too small for the batch size and sequence length\n"); - exit(EXIT_FAILURE); - } - loader->current_position = 0; // start at the beginning - - // allocate space for B*T + 1 integers to store the inputs and targets - // Using CUDA CPU pinned memory for faster PCI Express transfers to GPU - // See: https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/ - cudaMallocHost((void**)&loader->batch, (B * T + 1) * sizeof(int)); - loader->inputs = loader->batch; - loader->targets = loader->batch + 1; // targets are shifted by one - loader->num_batches = loader->file_size / (B * T * sizeof(int)); -} - -void dataloader_reset(DataLoader *loader) { - loader->current_position = 0; -} - -void dataloader_next_batch(DataLoader *loader) { - int B = loader->B; - int T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (B*T+1) * sizeof(int) > loader->file_size) { - loader->current_position = 0; - } - // read the B*T+1 integers from the file into batch - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); - freadCheck(loader->batch, sizeof(int), B*T+1, loader->tokens_file); - // advance the current position by B*T 
integers - loader->current_position += B*T * sizeof(int); -} - -void dataloader_free(DataLoader *loader) { - fcloseCheck(loader->tokens_file); - cudaFreeHost(loader->batch); -} - // ---------------------------------------------------------------------------- // sampler: takes probabilities and samples integers from them @@ -1689,10 +1622,9 @@ int main(int argc, char *argv[]) { assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow sprintf(train_tokens_filename, "%s_train.bin", input_dataset_prefix); sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); - DataLoader train_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T); - DataLoader val_loader; - dataloader_init(&val_loader, val_tokens_filename, B, T); + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_tokens_filename, B, T, 0, 1); + dataloader_init(&val_loader, val_tokens_filename, B, T, 0, 1); int train_num_batches = train_loader.num_batches; // let's do 1 epoch by default for now int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf("| train_num_batches | %-50d |\n", train_num_batches); From 9bad49a99a47ae777a0a5a211db1294ab3e2bd0d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:46:14 +0000 Subject: [PATCH 126/172] also fix the python file. that should be it now, w.r.t. the new token format .bin files --- train_gpt2.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/train_gpt2.py b/train_gpt2.py index 4d61e68cd..efb64695f 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -502,7 +502,14 @@ def print0(*args, **kwargs): exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: - tokens = np.frombuffer(f.read(), dtype=np.int32) + # first read the header, which is 256 int32 integers (4 bytes each) + header = np.frombuffer(f.read(256*4), dtype=np.int32) + assert header[0] == 20240520, "magic number mismatch, corrupt file?" + assert header[1] == 1, "unsupported version" + ntok = header[2] # number of tokens (claimed) + # the rest of it are tokens, stored as uint16 + tokens = np.frombuffer(f.read(), dtype=np.uint16) + assert len(tokens) == ntok, "number of tokens read does not match header?" 
# np -> tensor, long, on device tokens = torch.tensor(tokens) From d53608820a610c3b338ab90fb5bd2f74ec35281e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 17:55:56 +0000 Subject: [PATCH 127/172] the write_datafile function accepts python list, which i think is faster but i didn't check --- dev/data/data_common.py | 2 +- dev/data/tinyshakespeare.py | 9 ++++----- dev/data/tinystories.py | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/dev/data/data_common.py b/dev/data/data_common.py index ec85cb90b..8bae1274d 100644 --- a/dev/data/data_common.py +++ b/dev/data/data_common.py @@ -38,7 +38,7 @@ def write_datafile(filename, toks): # validate that no token exceeds a uint16 maxtok = 2**16 assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" - # construct the tokens + # construct the tokens numpy array toks_np = np.array(toks, dtype=np.uint16) # write to file print(f"writing {len(toks):,} tokens to {filename}") diff --git a/dev/data/tinyshakespeare.py b/dev/data/tinyshakespeare.py index 6b7cbb976..d9b4b6e22 100644 --- a/dev/data/tinyshakespeare.py +++ b/dev/data/tinyshakespeare.py @@ -45,15 +45,14 @@ def tokenize(): text = text.replace('\n\n', '\n\n<|endoftext|>') # encode the text tokens = encode(text) - tokens_np = np.array(tokens, dtype=np.int32) # let's take the first 32,768 tokens as the validation split (~10%) - val_tokens_np = tokens_np[:32768] - train_tokens_np = tokens_np[32768:] + val_tokens = tokens[:32768] + train_tokens = tokens[32768:] # save to file val_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_val.bin") train_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_train.bin") - write_datafile(val_filename, val_tokens_np) - write_datafile(train_filename, train_tokens_np) + write_datafile(val_filename, val_tokens) + write_datafile(train_filename, train_tokens) if __name__ == "__main__": download() diff --git a/dev/data/tinystories.py b/dev/data/tinystories.py index 83621e1d8..fed8bc61c 100644 --- a/dev/data/tinystories.py +++ b/dev/data/tinystories.py @@ -94,9 +94,8 @@ def tokenize(): for future in as_completed(futures): all_tokens.extend(future.result()) - all_tokens_np = np.array(all_tokens, dtype=np.int32) split_filename = os.path.join(DATA_CACHE_DIR, f"TinyStories_{split_name}.bin") - write_datafile(split_filename, all_tokens_np) + write_datafile(split_filename, all_tokens) if __name__ == "__main__": download() From ccc240caab7ee7379a5f1aa4488cbb69e39fa469 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:01:35 +0000 Subject: [PATCH 128/172] make comment more helpful --- dataloader.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dataloader.h b/dataloader.h index fa4e62adc..110276929 100644 --- a/dataloader.h +++ b/dataloader.h @@ -57,7 +57,12 @@ void dataloader_init(DataLoader *loader, // validate the header int header[HEADER_SIZE]; freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); - if (header[0] != 20240520) { printf("Bad magic in data file\n"); exit(EXIT_FAILURE); } + if (header[0] != 20240520) { + printf("Bad magic in the data file\n"); + printf("---> HINT: Are you passing in a correct file?\n"); + printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n"); + exit(EXIT_FAILURE); + } if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } long ntok = header[2]; // number of tokens in the file From f7cb77f3d955143627fb36ffd072436b8c930d52 Mon Sep 17 
00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:20:51 +0000 Subject: [PATCH 129/172] docs on master-breaking change around how we store data .bin files --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index aee282fe0..14cdc8da1 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,10 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.zig](https://github.com/Saimirbaci/llm.zig) by @[saimirbaci](https://github.com/Saimirbaci): a Zig port of this project +## major changes log + +- May 21, 2024: I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. + ## discussions Ways of organizing development: From 7d58fd2abb0672a6074f6e9c87bfff27676048cc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:29:55 +0000 Subject: [PATCH 130/172] adjust py file as well and make the errors better --- train_gpt2.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/train_gpt2.py b/train_gpt2.py index efb64695f..b2def106b 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -497,14 +497,19 @@ def print0(*args, **kwargs): # note we're using val by default instead of train split just because it is smaller/faster if not os.path.isfile(args.input_bin): print0(f"ERROR: input .bin file not found: {args.input_bin}") - print0("---> HINT: try to re-run the data prepro script. these recently moved to dev/data") - print0("---> HINT: for example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + print0("---> HINT: Try to re-run the data prepro script. these recently moved to dev/data") + print0("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") exit(1) print0(f"loading cached tokens in {args.input_bin}") with open(args.input_bin, "rb") as f: # first read the header, which is 256 int32 integers (4 bytes each) header = np.frombuffer(f.read(256*4), dtype=np.int32) - assert header[0] == 20240520, "magic number mismatch, corrupt file?" 
+ if header[0] != 20240520: + print0("ERROR: magic number mismatch in the data .bin file!") + print0("---> HINT: Are you passing in a correct file with --input_bin?") + print0("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") + print0("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") + exit(1) assert header[1] == 1, "unsupported version" ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 From 54ccbd300c49a4421144c448bfe5f83666c4cb9a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 18:31:19 +0000 Subject: [PATCH 131/172] docs on master-breaking changes around dataset file representation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14cdc8da1..a641afda0 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p ## major changes log -- May 21, 2024: I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. +- **May 21, 2024: Dataset refactor**. I refactored the .bin files that hold the tokens to include a header like all the other .bin files that e.g. store the model weights. This was necessary to support multiple versions and future development. Unfortunately, this will brick everyone's master the next time you `git pull`, because the .bin files you've generated before are the legacy version. To fix this, you only have to re-generate the data in the new format. For example, for Tiny Shakespeare run: `python dev/data/tinyshakespeare.py`. For Tiny Stories, `python dev/data/tinystories.py`. Also notice that the location of these data files has changed. They used to just be "flat" and inside `data/` folder, but now all the data-related code was moved to `dev/data` files and sub-directories, to keep things organized. Apologies for breaking change, I'll try not to brick master too much in general. 
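One way to tell whether a particular .bin file has already been re-generated in the new format is to replicate the header check that dataloader.h and train_gpt2.py now perform. The following is only a standalone sketch mirroring those checks (magic 20240520, version 1, token count, expected file size); it is not a tool that ships with the repo:

    // check_bin.c: rough sketch of the header validation done by dataloader_init
    #include <stdio.h>
    #include <stdint.h>

    int main(int argc, char **argv) {
        if (argc < 2) { printf("usage: ./check_bin file.bin\n"); return 1; }
        FILE *f = fopen(argv[1], "rb");
        if (f == NULL) { printf("could not open %s\n", argv[1]); return 1; }
        int32_t header[256];
        if (fread(header, sizeof(int32_t), 256, f) != 256) { printf("short read\n"); return 1; }
        if (header[0] != 20240520) { printf("bad magic -> legacy file? re-run the prepro script\n"); return 1; }
        if (header[1] != 1) { printf("unsupported version %d\n", (int)header[1]); return 1; }
        long ntok = header[2];
        fseek(f, 0, SEEK_END);
        long file_size = ftell(f);
        long expected = 256 * sizeof(int32_t) + ntok * sizeof(uint16_t);
        printf("tokens: %ld, file size: %ld (expected %ld)\n", ntok, file_size, expected);
        fclose(f);
        return (file_size == expected) ? 0 : 1;
    }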
## discussions From 587506d09604ca76415fdf3ac5ab62ca0542d9a8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 19:22:09 +0000 Subject: [PATCH 132/172] torch tensor can't handle uint16 so let's convert to int32, which is silly because we'll convert to .long right after but ok --- train_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_gpt2.py b/train_gpt2.py index b2def106b..f844004d2 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -514,6 +514,8 @@ def print0(*args, **kwargs): ntok = header[2] # number of tokens (claimed) # the rest of it are tokens, stored as uint16 tokens = np.frombuffer(f.read(), dtype=np.uint16) + # convert tokens to int32 because torch can't handle uint16 sad + tokens = tokens.astype(np.int32) assert len(tokens) == ntok, "number of tokens read does not match header?" # np -> tensor, long, on device From 967420d1d13109c50318a332a0e5c6cb9fef395a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 19:30:02 +0000 Subject: [PATCH 133/172] fix print format warning for size_t vs int --- train_gpt2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_gpt2.c b/train_gpt2.c index 57296736b..57bdfe929 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -1041,8 +1041,8 @@ int main() { DataLoader train_loader, val_loader; dataloader_init(&train_loader, train_tokens, B, T, 0, 1); dataloader_init(&val_loader, val_tokens, B, T, 0, 1); - printf("train dataset num_batches: %d\n", train_loader.num_batches); - printf("val dataset num_batches: %d\n", val_loader.num_batches); + printf("train dataset num_batches: %zu\n", train_loader.num_batches); + printf("val dataset num_batches: %zu\n", val_loader.num_batches); int val_num_batches = 5; // build the Tokenizer From 31310282e164902b7e37dc6883aa7e29666eb9df Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 21 May 2024 21:33:24 +0000 Subject: [PATCH 134/172] extend dataloader to be sharded --- dataloader.h | 110 +++++++++++++++++++++++++++++++------------- dev/data/fineweb.py | 3 +- train_gpt2.cu | 29 ++++++------ 3 files changed, 93 insertions(+), 49 deletions(-) diff --git a/dataloader.h b/dataloader.h index 110276929..a9864fbe7 100644 --- a/dataloader.h +++ b/dataloader.h @@ -2,6 +2,7 @@ Implements a medium simple DataLoader for a distributed training setup. 
*/ +#include #include #include #include @@ -23,6 +24,8 @@ typedef struct { size_t B; size_t T; // input handling and its state + glob_t glob_result; // stores the result of glob, for all shards we want to iterate + int current_shard; // the current shard we are reading from FILE* tokens_file; long file_size; long current_position; @@ -34,25 +37,13 @@ typedef struct { size_t num_batches; } DataLoader; -void dataloader_reset(DataLoader *loader) { - // each process starts at a different offset in the file - long header_bytes = HEADER_SIZE * sizeof(int); - long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); - loader->current_position = header_bytes + token_bytes_offset; -} - -void dataloader_init(DataLoader *loader, - const char* filename, - size_t B, - size_t T, - int process_rank, - int num_processes) { - loader->process_rank = process_rank; - loader->num_processes = num_processes; - loader->B = B; - loader->T = T; - - // open the input file for reading +long dataloader_load_shard_(DataLoader *loader, int shard_index) { + // use the first glob match as the filename for now + const char* filename = loader->glob_result.gl_pathv[shard_index]; + // open the input file for reading. also only a single file can be opened at a time + if (loader->tokens_file != NULL) { + fcloseCheck(loader->tokens_file); + } loader->tokens_file = fopenCheck(filename, "rb"); // validate the header int header[HEADER_SIZE]; @@ -65,7 +56,7 @@ void dataloader_init(DataLoader *loader, } if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } long ntok = header[2]; // number of tokens in the file - + assert(ntok > 0); // we expect some tokens in the file. this should never trip, right? // determine the file size and make sure it is consistent with the number of tokens fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. 
file size @@ -76,31 +67,80 @@ void dataloader_init(DataLoader *loader, printf("Error: file size is not as expected\n"); exit(EXIT_FAILURE); } - if (ntok < num_processes * B * T + 1) { - // being too defensive/lazy, we could tolerate as low as T+1 tokens in principle - printf("Error: there are too few tokens\n"); + return ntok; +} + +void dataloader_reset(DataLoader *loader) { + // fully resets the DataLoader object to init configuration + // each process starts at a different offset in the file + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_shard = 0; + loader->current_position = header_bytes + token_bytes_offset; + dataloader_load_shard_(loader, loader->current_shard); +} + +void dataloader_advance_(DataLoader *loader) { + // advance the loader by loading the next data shard and resetting the position + if (loader->glob_result.gl_pathc > 1) { + // if we have more than one shard, advance to the next one + loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc; + dataloader_load_shard_(loader, loader->current_shard); + } + long header_bytes = HEADER_SIZE * sizeof(int); + long token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + loader->current_position = header_bytes + token_bytes_offset; +} + +void dataloader_init(DataLoader *loader, + const char* filename_pattern, + size_t B, + size_t T, + int process_rank, + int num_processes) { + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + loader->tokens_file = NULL; + + // glob to get the list of files matching the pattern, these are our data shards + int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); + if (glob_status != 0) { + printf("Error: failed to glob pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + if (loader->glob_result.gl_pathc == 0) { + printf("Error: no files found matching the pattern: %s\n", filename_pattern); exit(EXIT_FAILURE); } - // allocate space for B*T + 1 integers to store the inputs and targets + // inspect and validate all shards so we don't get any runtime errors later + // if too slow / too many shards, may wish to revisit later + long ntok_total = 0; + for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) { + long shard_ntok = dataloader_load_shard_(loader, shard_index); + // we need at least one batch/shard, the way things are written right now. + // can be relaxed a lot later. + assert(shard_ntok >= num_processes * B * T + 1); + ntok_total += shard_ntok; + } + printf("DataLoader: filename_pattern: %s\n", filename_pattern); + printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + + // allocate all the space we'll need loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); loader->inputs = (int*)malloc(B * T * sizeof(int)); loader->targets = (int*)malloc(B * T * sizeof(int)); - // note: we definitely want to advance by B * T; That is the "stride" by which we move - // the window of tokens. 
We only load B * T + 1 tokens because our targets are offset by 1 - loader->num_batches = ntok / (num_processes * B * T); + loader->num_batches = ntok_total / (num_processes * B * T); // useful to know - // reset the loader to the beginning of the file + // reset the loader, to initialize it dataloader_reset(loader); } void dataloader_next_batch(DataLoader *loader) { size_t B = loader->B; size_t T = loader->T; - // if we are at the end of the file, loop back to the beginning - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { - dataloader_reset(loader); - } // read B*T+1 uint16_t tokens from the file into buffer fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); @@ -111,7 +151,12 @@ void dataloader_next_batch(DataLoader *loader) { } // advance the current position by B*T*num_processes integers // note: the "stride" of tokens by which we move each time is definitely B * T + // we only load B * T + 1 tokens at each iteration because the targets are offset by 1 loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); + // if the next batch would go past the end of the file, advance the loader + if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + dataloader_advance_(loader); + } } void dataloader_free(DataLoader *loader) { @@ -119,4 +164,5 @@ void dataloader_free(DataLoader *loader) { free(loader->inputs); free(loader->targets); fcloseCheck(loader->tokens_file); + globfree(&loader->glob_result); } diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 41091ba6a..8369112a4 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -72,7 +72,8 @@ def tokenize(doc): # if we reach shard_size tokens, write shard to disk if len(all_tokens) >= args.shard_size: - filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{shard_index:06d}.bin") + split = "val" if shard_index == 0 else "train" + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") write_tokens = all_tokens[:args.shard_size] rest_tokens = all_tokens[args.shard_size:] write_datafile(filename, write_tokens) diff --git a/train_gpt2.cu b/train_gpt2.cu index f3dfca5c2..4151b22c5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2541,12 +2541,10 @@ void logger_free(Logger *logger) { // CLI, poor man's argparse void error_usage() { - // default run = debugging run with TinyShakespeare - // bigger run = train on TinyStories! e.g. 
val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = dev/data/tinyshakespeare/tiny_shakespeare)\n"); + fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); + fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); @@ -2572,7 +2570,8 @@ int main(int argc, char *argv[]) { multi_gpu_config = multi_gpu_config_init(&argc, &argv); // read in the (optional) command line arguments - const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. data/TinyStories + const char* train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model const char* output_log_file = NULL; int B = 4; // batch size @@ -2595,7 +2594,8 @@ int main(int argc, char *argv[]) { if (argv[i][0] != '-') { error_usage(); } // must start with dash if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args - if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + if (argv[i][1] == 'i') { train_data_pattern = argv[i+1]; } + else if (argv[i][1] == 'j') { val_data_pattern = argv[i+1]; } else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size @@ -2617,10 +2617,14 @@ int main(int argc, char *argv[]) { } // calculate a sensible default for total batch size by assuming no gradient accumulation if (total_batch_size == -1) { total_batch_size = B * T * multi_gpu_config.num_processes; } + // if we're only overfitting a single batch for debugging, let's overfit the first batch + // from val instead of train split, because val is smaller and faster. (train_gpt2.py does the same) + if (overfit_single_batch == 1) { train_data_pattern = val_data_pattern; } printf0("+-----------------------+----------------------------------------------------+\n"); printf0("| Parameter | Value |\n"); printf0("+-----------------------+----------------------------------------------------+\n"); - printf0("| input dataset prefix | %-50s |\n", input_dataset_prefix); + printf0("| train data pattern | %-50s |\n", train_data_pattern); + printf0("| val data pattern | %-50s |\n", val_data_pattern); printf0("| output log file | %-50s |\n", output_log_file == NULL ? 
"NULL" : output_log_file); printf0("| micro batch size B | %-50d |\n", B); printf0("| sequence length T | %-50d |\n", T); @@ -2663,16 +2667,9 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128], val_tokens_filename[128]; - assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow - // if we're only overfitting a single batch for debugging, let's overfit the first batch - // from val instead of train split, because val is smaller and a bit faster - const char* train_split = (overfit_single_batch == 1) ? "val" : "train"; - sprintf(train_tokens_filename, "%s_%s.bin", input_dataset_prefix, train_split); - sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); - dataloader_init(&val_loader, val_tokens_filename, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); int train_num_batches = (max_steps == -1) ? train_loader.num_batches : max_steps; // default = 1 epoch int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf0("| train_num_batches | %-50d |\n", train_num_batches); From 7d0891f6ddebebeefc8a9a5c3f319484aa31f1d5 Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 22:37:09 +0100 Subject: [PATCH 135/172] Fully deterministic layernorm (slight perf loss) --- train_gpt2.cu | 110 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/train_gpt2.cu b/train_gpt2.cu index 1e8b54be2..6c60b8a74 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -980,30 +980,34 @@ __global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, s } } -__global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with only 1024 threads? - layernorm_backward_kernel8(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, +__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads? 
+ layernorm_backward_kernel9(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { + constexpr int BLOCK_SIZE = 512; + constexpr int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block extern __shared__ float shared[]; // size = 2 * C + 1 + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block - int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp int warpsInGrid = gridDim.x * warpsInBlock; int C_per_iteration = WARP_SIZE * x128::size; - int iterations_C = C / C_per_iteration; + int iterations_C = CEIL_DIV(C, C_per_iteration); // the first half of shared memory is bias, second is weight float* dbias_shared = shared; float* dweight_shared = shared + C; + float* dbias_tmp_shared = shared + 2 * C; + float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; // init shared memory to zero - for(int i = threadIdx.x; i < C; i+= blockDim.x){ + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; } - unsigned int *tmp_flag = (unsigned int*)(shared + C*2); + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); __syncthreads(); for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { @@ -1041,6 +1045,10 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with for (int i = 0; i < iterations_C; i++) { int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + x128 dout128 = load128cs(dout_bt + global_index); x128 inp128 = load128cs(inp_bt + global_index); x128 dinp128 = load128(dinp_bt + global_index); @@ -1050,10 +1058,29 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with float dout_i = (float)dout128[x]; float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; float dnorm_i = (float)weight128[x] * dout_i; - // gradient contribution to bias (using shared memory friendly index) - atomicAdd(&dbias_shared[shared_index + x*WARP_SIZE], dout_i); - // gradient contribution to weight (using shared memory friendly index) - atomicAdd(&dweight_shared[shared_index + x*WARP_SIZE], norm_bti * dout_i); + + // sum up the gradients for bias and weight across the entire block + // this is basically a reduction (but only inter-warp, not intra-warp) + // doing it this way allows us to avoid using atomics while using many warps + if (warpId != 0) { + dbias_tmp_shared[threadIdx.x] = dout_i; + dweight_tmp_shared[threadIdx.x] = norm_bti * dout_i; + } + __syncthreads(); + if (warpId == 0) { + float dbias_tmp = dout_i; + float dweight_tmp = norm_bti * dout_i; + for (int j = 1; j < warpsInBlock; j++) { + dbias_tmp += dbias_tmp_shared[threadIdx.x + j * WARP_SIZE]; + dweight_tmp += dweight_tmp_shared[threadIdx.x + j * WARP_SIZE]; + } + // gradient contribution to bias (using shared memory friendly index) + dbias_shared[shared_index + x*WARP_SIZE] += dbias_tmp; + // gradient contribution to weight (using shared memory friendly index) + dweight_shared[shared_index + x*WARP_SIZE] += dweight_tmp; + } + __syncthreads(); + // gradient contribution to input float dval = 0.0f; dval += dnorm_i; // term 1 @@ -1066,35 +1093,64 @@ __global__ void __launch_bounds__(512, 3) // todo - any warnings on Turing with 
store128cg(dinp_bt + global_index, dinp128); } } - // Accumulate into a FP32 scratchpad - // BF16 atomics are potentially much slower... and this is more precise! - // todo - could potentially avoid the extra copy if floatX is FP32, fairly negligible though __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; - unsigned int* scratchFlag = (unsigned int*)(scratch + (2 * C)); - for(int i = threadIdx.x; i < C; i+= blockDim.x) { - // global atomics in the same "shared memory banking friendly" order - atomicAdd(&scratch_dbias[i], dbias_shared[i]); - atomicAdd(&scratch_dweight[i], dweight_shared[i]); + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + // Write to global memory in the same "shared memory banking friendly" order + scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; + scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; } + __syncthreads(); if (threadIdx.x == 0) { *tmp_flag = atomicInc(scratchFlag, gridDim.x); } __syncthreads(); if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! + for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + f128 dbias_accum(make_int4(0, 0, 0, 0)); + f128 dweight_accum(make_int4(0, 0, 0, 0)); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + // this is separate also because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? 
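    // note on determinism: with the previous atomicAdd-based accumulation, the order in which
    // blocks and threads added into dbias/dweight varied from run to run, and float addition
    // is not associative, so results could differ in the last bits. Here every block writes its
    // partial sums into its own slice of scratch (offset by 2*C*blockIdx.x), and only the last
    // block to arrive (detected via atomicInc on scratchFlag) adds the partials together in a
    // fixed block order, so repeated runs produce bitwise-identical gradients.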
for (int i = warpId; i < iterations_C; i += warpsInBlock) { - // reorder from atomic/shared memory-friendly index to real global memory index - // and convert from float/FP32 to floatX/BF16 for the final write int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } x128 dbias128 = load128(dbias + global_index); x128 dweight128 = load128(dweight + global_index); for (int x = 0; x < x128::size; x++) { - float s_db = scratch_dbias[shared_index + x*WARP_SIZE]; - float s_dw = scratch_dweight[shared_index + x*WARP_SIZE]; + float s_db = dbias_shared[shared_index + x*WARP_SIZE]; + float s_dw = dweight_shared[shared_index + x*WARP_SIZE]; dbias128[x] = (floatX)(s_db + (float)dbias128[x]); dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); } @@ -1603,15 +1659,13 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { NVTX_RANGE_FN(); - // todo - forcing 3 x 512 threads per SM maximum is a bit hacky, but more than that results in - // cache thrashing and lower performance on A100... is there a better way? const int block_size = 512; - const int blocks_per_sm = min(3, (deviceProp.maxThreadsPerMultiProcessor / 1024)); + const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3 const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; - size_t shared_mem_size = (2 * C + 1) * sizeof(float); + size_t shared_mem_size = (2*C + 2*block_size + 1) * sizeof(float); // see kernel - cudaMemset(scratch, 0, (2 * C + 1) * sizeof(float)); - layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + cudaMemset(scratch, 0, 1 * sizeof(float)); // only need to reset the flag to 0 + layernorm_backward_kernel9<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); cudaCheck(cudaGetLastError()); } From 7cbeefc7f371412bbaca2990abbf5873bb8547ae Mon Sep 17 00:00:00 2001 From: ademeure Date: Tue, 21 May 2024 23:26:54 +0100 Subject: [PATCH 136/172] added new layernorm backward to /dev/cuda/ --- dev/cuda/layernorm_backward.cu | 199 ++++++++++++++++++++++++++++++++- train_gpt2.cu | 20 ++-- 2 files changed, 206 insertions(+), 13 deletions(-) diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index 90dcb1674..d9502880b 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -856,6 +856,185 @@ __global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) } } +__global__ void layernorm_backward_kernel9(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const floatX* mean, const floatX* rstd, + int B, int T, int C) { + constexpr int WARP_SIZE = 32; + int BLOCK_SIZE = blockDim.x; + int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block + extern __shared__ float shared[]; // size = 2 * C + 1 + + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = WARP_SIZE * x128::size; + int iterations_C = ceil_div(C, C_per_iteration) + 2; + + // the first half of shared memory is bias, second is weight + float* 
dbias_shared = shared; + float* dweight_shared = shared + C; + float* dbias_tmp_shared = shared + 2 * C; + float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; + + // init shared memory to zero + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ + dbias_shared[i] = 0.0f; + dweight_shared[i] = 0.0f; + } + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); + __syncthreads(); + + for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { + int b = idx / T; + int t = idx % T; + + const floatX* dout_bt = dout + b * T * C + t * C; + const floatX* inp_bt = inp + b * T * C + t * C; + floatX* dinp_bt = dinp + b * T * C + t * C; + const float mean_bt = (float)mean[b * T + t]; + const float rstd_bt = (float)rstd[b * T + t]; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float norm_bti = ((float)inp128_i[k] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * norm_bti; + } + } + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C; + + // now iterate again and accumulate all the gradients + // unfortunately we cannot use the same index for x128 arrays and shared memory + // as atomics can only be 32-bit rather than 128-bit (at least pre-SM90/Hopper) + // so this would result in an 8-way bank conflict, and kill performance + // so instead, we use a shared memory friendly index, and reorder before the final write + for (int i = 0; i < iterations_C; i++) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dout128 = load128cs(dout_bt + global_index); + x128 inp128 = load128cs(inp_bt + global_index); + x128 dinp128 = load128(dinp_bt + global_index); + x128 weight128 = load128(weight + global_index); + + for (int x = 0; x < x128::size; x++) { + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + float dnorm_i = (float)weight128[x] * dout_i; + + // sum up the gradients for bias and weight across the entire block + // this is basically a reduction (but only inter-warp, not intra-warp) + // doing it this way allows us to avoid using atomics while using many warps + if (warpId != 0) { + dbias_tmp_shared[threadIdx.x] = dout_i; + dweight_tmp_shared[threadIdx.x] = norm_bti * dout_i; + } + __syncthreads(); + if (warpId == 0) { + float dbias_tmp = dout_i; + float dweight_tmp = norm_bti * dout_i; + for (int j = 1; j < warpsInBlock; j++) { + dbias_tmp += dbias_tmp_shared[threadIdx.x + j * WARP_SIZE]; + dweight_tmp += dweight_tmp_shared[threadIdx.x + j * WARP_SIZE]; + } + // gradient contribution to bias (using shared memory friendly index) + dbias_shared[shared_index + x*WARP_SIZE] += dbias_tmp; + // gradient contribution to weight (using shared memory friendly index) + dweight_shared[shared_index + x*WARP_SIZE] += dweight_tmp; + } + __syncthreads(); + + // gradient contribution to input + float dval = 0.0f; + dval += dnorm_i; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX)((float)dinp128[x] + dval); + } 
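    // the dval update above is the standard layernorm input gradient:
    //   dx_i = rstd * (dnorm_i - mean_j(dnorm_j) - norm_i * mean_j(dnorm_j * norm_j))
    // where dnorm_i = weight_i * dout_i and norm_i = (x_i - mean) * rstd;
    // dnorm_mean and dnorm_norm_mean computed earlier hold the two means (already divided by C).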
+ // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + // Write to global memory in the same "shared memory banking friendly" order + scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; + scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; + } + __syncthreads(); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! + for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + f128 dbias_accum(make_int4(0, 0, 0, 0)); + f128 dweight_accum(make_int4(0, 0, 0, 0)); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // reorder from atomic/shared memory-friendly index to real global memory index + // and convert from float/FP32 to floatX/BF16 for the final write + // this is separate also because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? 
+ for (int i = warpId; i < iterations_C; i += warpsInBlock) { + int global_index = (warpThreadIdx * x128::size) + (i * C_per_iteration); + int shared_index = warpThreadIdx + (i * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); + for (int x = 0; x < x128::size; x++) { + float s_db = dbias_shared[shared_index + x*WARP_SIZE]; + float s_dw = dweight_shared[shared_index + x*WARP_SIZE]; + dbias128[x] = (floatX)(s_db + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw + (float)dweight128[x]); + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + // ---------------------------------------------------------------------------- // kernel launchers @@ -947,6 +1126,18 @@ void layernorm_backward8(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* s layernorm_backward_kernel8<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); } +template +void layernorm_backward9(Tdinp* dinp, Tparams* dweight, Tparams* dbias, float* scratch, + const Tdout* dout, const Trest* inp, const Tparams* weight, const Trest* mean, const Trest* rstd, + int B, int T, int C, int block_size) { + + const int grid_size = (1024/block_size) * cuda_num_SMs; // todo - heuristics for other GPUs? + size_t shared_mem_size = (2 * C + 2 * block_size + 1) * sizeof(float); + + cudaMemset(scratch, 0, 1 * sizeof(float)); // just need to memset the flag for this version + layernorm_backward_kernel9<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); +} + // kernel version dispatch void layernorm_backward(int kernel_num, floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, @@ -982,6 +1173,9 @@ void layernorm_backward(int kernel_num, case 8: layernorm_backward8(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); break; + case 9: + layernorm_backward9(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -1042,7 +1236,7 @@ int main(int argc, char **argv) { cudaCheck(cudaMalloc(&d_weight, C * sizeof(floatX))); cudaCheck(cudaMalloc(&d_mean, B * T * sizeof(floatX))); cudaCheck(cudaMalloc(&d_rstd, B * T * sizeof(floatX))); - cudaCheck(cudaMalloc(&d_scratch, cuda_num_SMs * (2 * C + 1) * sizeof(float))); + cudaCheck(cudaMalloc(&d_scratch, (1024/32) * cuda_num_SMs * (2 * C + 1) * sizeof(float))); // copy over the "inputs" to the backward call cudaCheck(memcpy_convert(d_dout, dout, B * T * C)); cudaCheck(memcpy_convert(d_inp, inp, B * T * C)); @@ -1051,7 +1245,8 @@ int main(int argc, char **argv) { cudaCheck(memcpy_convert(d_rstd, rstd, B * T)); // launch the kernel - int block_sizes[] = {32, 64, 128, 256, 512, 768, 1024}; + // removed 768 because it doesn't work for kernel9 despite being OK in train_gpt2.cu?! 
+ int block_sizes[] = {32, 64, 128, 256, 512, /*768,*/ 1024}; for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; // init the "outputs" of the backward call to zeros diff --git a/train_gpt2.cu b/train_gpt2.cu index 6c60b8a74..31b6db2b7 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -985,10 +985,8 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { - constexpr int BLOCK_SIZE = 512; - constexpr int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block - extern __shared__ float shared[]; // size = 2 * C + 1 - + extern __shared__ float shared[]; // size = 2*C + 2*block_size + 1 + int warpsInBlock = blockDim.x / WARP_SIZE; //number of warps in block int warpId = threadIdx.x / WARP_SIZE; // warp index within a block int baseIdx = blockIdx.x * warpsInBlock + warpId; int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp @@ -1000,14 +998,14 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with float* dbias_shared = shared; float* dweight_shared = shared + C; float* dbias_tmp_shared = shared + 2 * C; - float* dweight_tmp_shared = shared + 2 * C + BLOCK_SIZE; + float* dweight_tmp_shared = shared + 2 * C + blockDim.x; // init shared memory to zero - for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE){ + for(int i = threadIdx.x; i < C; i+= blockDim.x){ dbias_shared[i] = 0.0f; dweight_shared[i] = 0.0f; } - unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*BLOCK_SIZE); + unsigned int *tmp_flag = (unsigned int*)(shared + 2*C + 2*blockDim.x); __syncthreads(); for (int idx = baseIdx; idx < B * T; idx += warpsInGrid) { @@ -1102,12 +1100,14 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with scratch += 32; float* scratch_dbias = scratch; float* scratch_dweight = scratch + C; - for(int i = threadIdx.x; i < C; i+= BLOCK_SIZE) { + for(int i = threadIdx.x; i < C; i+= blockDim.x) { // Write to global memory in the same "shared memory banking friendly" order scratch_dbias[i + 2*C*blockIdx.x] = dbias_shared[i]; scratch_dweight[i + 2*C*blockIdx.x] = dweight_shared[i]; } + // todo - everything below could become a separate kernel for better performance with maybe less code + // not enough parallelism even inside that single SM... do we need another level of reduction?! __syncthreads(); if (threadIdx.x == 0) { *tmp_flag = atomicInc(scratchFlag, gridDim.x); @@ -1115,9 +1115,7 @@ __global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with __syncthreads(); if (*tmp_flag == gridDim.x-1) { // Reduction of the partial sums by the final block - // todo - there isn't enough parallelism even inside that single SM... - // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! 
- for(int i = threadIdx.x * f128::size; i < C; i+= BLOCK_SIZE * f128::size) { + for(int i = threadIdx.x * f128::size; i < C; i+= blockDim.x * f128::size) { f128 dbias_accum(make_int4(0, 0, 0, 0)); f128 dweight_accum(make_int4(0, 0, 0, 0)); From edb0df967a6b3a3dfa7f7e3b440bdf3c2a4d7d7e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 11:59:09 +0000 Subject: [PATCH 137/172] continued changes for sharded dataloader --- dataloader.h | 16 +++++++-------- dev/data/fineweb.py | 50 +++++++++++++++++++++++++-------------------- train_gpt2.cu | 2 +- train_gpt2_fp32.cu | 24 +++++++++------------- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/dataloader.h b/dataloader.h index a9864fbe7..93019317f 100644 --- a/dataloader.h +++ b/dataloader.h @@ -16,8 +16,8 @@ Implements a medium simple DataLoader for a distributed training setup. #define HEADER_SIZE 256 typedef struct { - // Distributed data parallel specifics. - // Each worker loads it's own chunk of data. + // variables related to distributed training + // each process/worker has to access different parts of the data int process_rank; int num_processes; // hyperparameters. use size_t to prevent overflow @@ -29,12 +29,11 @@ typedef struct { FILE* tokens_file; long file_size; long current_position; - // outputs - uint16_t* buffer; // used to fread data from file into + uint16_t* buffer; // we fread data from file into this buffer + // public variables that could be accessed from outside + size_t num_batches; int* inputs; // input tokens into transformer int* targets; // target tokens for the transformer - // convenience variables - size_t num_batches; } DataLoader; long dataloader_load_shard_(DataLoader *loader, int shard_index) { @@ -125,8 +124,9 @@ void dataloader_init(DataLoader *loader, assert(shard_ntok >= num_processes * B * T + 1); ntok_total += shard_ntok; } - printf("DataLoader: filename_pattern: %s\n", filename_pattern); - printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); + // debugging prints + // printf("DataLoader: filename_pattern: %s\n", filename_pattern); + // printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); // allocate all the space we'll need loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 8369112a4..9b8863dac 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -55,30 +55,36 @@ def tokenize(doc): return enc.encode_ordinary(doc["text"]) # main loop write files -pool = mp.Pool() -shard_index = 0 -all_tokens = [] -progress_bar = None -for tokens in pool.imap(tokenize, fw): +with mp.Pool() as pool: + shard_index = 0 + all_tokens = [] + progress_bar = None + for tokens in pool.imap(tokenize, fw): - # record the tokens and make sure to separate documents - all_tokens.append(eot) - all_tokens.extend(tokens) + # record the tokens and make sure to separate documents + all_tokens.append(eot) + all_tokens.extend(tokens) - # update progress bar - if progress_bar is None: - progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") - progress_bar.update(len(tokens)) + # update progress bar + if progress_bar is None: + progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}") + progress_bar.update(len(tokens)) - # if we reach shard_size tokens, write shard to disk - if len(all_tokens) >= args.shard_size: + # if we reach shard_size tokens, write shard to disk + 
if len(all_tokens) >= args.shard_size: + split = "val" if shard_index == 0 else "train" + filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") + write_tokens = all_tokens[:args.shard_size] + rest_tokens = all_tokens[args.shard_size:] + write_datafile(filename, write_tokens) + shard_index += 1 + progress_bar = None + # note: create a copy so Python can free the all_tokens memory above + # the list rest_tokens is expected to be very small + all_tokens = [t for t in rest_tokens] + + # write any remaining tokens as the last shard + if len(all_tokens) > 0: split = "val" if shard_index == 0 else "train" filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") - write_tokens = all_tokens[:args.shard_size] - rest_tokens = all_tokens[args.shard_size:] - write_datafile(filename, write_tokens) - shard_index += 1 - progress_bar = None - # note: create a copy so Python can free the all_tokens memory above - # the list rest_tokens is expected to be very small - all_tokens = [t for t in rest_tokens] + write_datafile(filename, all_tokens) diff --git a/train_gpt2.cu b/train_gpt2.cu index 4151b22c5..578e7c80d 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -2545,7 +2545,7 @@ void error_usage() { fprintf(stderr, "Options:\n"); fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); - fprintf(stderr, " -e input model filename (default = gpt2_124M_bf16.bin)\n"); + fprintf(stderr, " -e input from model at this filename (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index 9a2dc6bb7..57697bc2f 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1525,12 +1525,10 @@ void logger_free(Logger *logger) { // CLI, poor man's argparse void error_usage() { - // default run = debugging run with TinyShakespeare - // bigger run = train on TinyStories! e.g. val/sample less often, but sample more tokens, write to logfile fprintf(stderr, "Usage: ./train_gpt2fp32cu [options]\n"); - fprintf(stderr, "Example: ./train_gpt2fp32cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -i input dataset prefix (default = data/tiny_shakespeare)\n"); + fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); + fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); fprintf(stderr, " -o output log file (default = NULL)\n"); fprintf(stderr, " -b batch size B (default = 4)\n"); fprintf(stderr, " -t sequence length T (default = 1024)\n"); @@ -1547,7 +1545,8 @@ void error_usage() { int main(int argc, char *argv[]) { // read in the (optional) command line arguments - const char* input_dataset_prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // or e.g. 
data/TinyStories + const char* train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char* val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* output_log_file = NULL; int B = 4; // batch size int T = 1024; // sequence length max @@ -1561,7 +1560,8 @@ int main(int argc, char *argv[]) { if (argv[i][0] != '-') { error_usage(); } // must start with dash if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) // read in the args - if (argv[i][1] == 'i') { input_dataset_prefix = argv[i+1]; } + if (argv[i][1] == 'i') { train_data_pattern = argv[i+1]; } + else if (argv[i][1] == 'j') { val_data_pattern = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_file = argv[i+1]; } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } @@ -1575,7 +1575,8 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); printf("| Parameter | Value |\n"); printf("+-----------------------+----------------------------------------------------+\n"); - printf("| input dataset prefix | %-50s |\n", input_dataset_prefix); + printf("| train data pattern | %-50s |\n", train_data_pattern); + printf("| val data pattern | %-50s |\n", val_data_pattern); printf("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); printf("| batch size B | %-50d |\n", B); printf("| sequence length T | %-50d |\n", T); @@ -1617,14 +1618,9 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val - char train_tokens_filename[128]; - char val_tokens_filename[128]; - assert(strlen(input_dataset_prefix) < 100); // being bit lazy here, make sure we don't overflow - sprintf(train_tokens_filename, "%s_train.bin", input_dataset_prefix); - sprintf(val_tokens_filename, "%s_val.bin", input_dataset_prefix); DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_tokens_filename, B, T, 0, 1); - dataloader_init(&val_loader, val_tokens_filename, B, T, 0, 1); + dataloader_init(&train_loader, train_data_pattern, B, T, 0, 1); + dataloader_init(&val_loader, val_data_pattern, B, T, 0, 1); int train_num_batches = train_loader.num_batches; // let's do 1 epoch by default for now int val_num_batches = train_loader.num_batches < val_max_batches ? train_loader.num_batches : val_max_batches; printf("| train_num_batches | %-50d |\n", train_num_batches); From 05be4f6d825d3b3de813e8468c21d663a5755f03 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 12:32:25 +0000 Subject: [PATCH 138/172] readme changes --- README.md | 104 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index a641afda0..f77870b25 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ make train_gpt2fp32cu ./train_gpt2fp32cu ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. 
Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. Below in the CUDA section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which is still being very actively developed, uses mixed precision, and runs ~2X faster. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. Note that in this quickstart we are using the fp32 version [train_gpt2_fp32.cu](train_gpt2_fp32.cu) of the CUDA code. In the next section we document the current "mainline" [train_gpt2.cu](train_gpt2.cu), which uses mixed precision, and runs ~2X faster. ## quick start (GPU, fast bleeding edge) @@ -45,75 +45,102 @@ Note that the default batch size is very low (4). If you have enough memory on y ./train_gpt2cu -b 32 ``` -My standard "prod" run with a nice GPU (e.g. A100 40GB) actually trains on TinyStories instead of TinyShakespeare, and looks like this: +My standard single-GPU "prod" run (e.g. with a A100 40GB) trains on TinyStories instead of TinyShakespeare and looks like this, as an example: ```bash python dev/data/tinystories.py make train_gpt2cu USE_CUDNN=1 -./train_gpt2cu -i dev/data/tinystories/TinyStories -v 250 -s 250 -g 144 -o stories.log -b 32 +./train_gpt2cu -i dev/data/tinystories/TinyStories_train.bin \ + -j dev/data/tinystories/TinyStories_val.bin \ + -v 250 -s 250 -g 144 -o stories.log -b 32 ``` -Where I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. +The `-i` flag is a glob pattern for the input data, `-j` for the val data. In addition I decrease the frequency of validation loss and sampling to every 250 steps, sample 144 tokens during sampling stage (to fit ~one story), and at batch size 32. -## quick start (CPU) - -The "I am so GPU poor that I don't even have one" section. No worries, run: +If you want to train on actual, real pretraining data, check out the recently added support for [fineweb dataset](https://huggingface.co/datasets/HuggingFaceFW/fineweb). Unlike the datasets above where the train/val tokens fit into a single .bin file, we now have multiple data shards as well. Here is an example: -```bash -pip install -r requirements.txt -python dev/data/tinyshakespeare.py -python train_gpt2.py -make train_gpt2 -OMP_NUM_THREADS=8 ./train_gpt2 +``` +# write fineweb data in 100M token shards to dev/data/fineweb10B +python dev/data/fineweb.py -s 100000000 +# compile and run +./train_gpt2cu -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -v 250 -s 250 -g 144 -o fineweb.log -b 32 ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. 
Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. +Where you will notice the use of glob pattern `*` to match all the train shards. ## quick start (multiple GPUs) -You'll be using the (more bleeding edge) mixed precision version of the code: +Great, let's get even more serious. We're using MPI and NCCL for multi-GPU training. Everything in the section above applies, with the following changes: ```bash +# example to install MPI: sudo apt install openmpi-bin openmpi-doc libopenmpi-dev +# the run command is now preceeded by `mpirun`: +mpirun -np ./train_gpt2cu +``` + +Sub in the number of GPUs you'd like to run on in the last command. All of the flags discussed in the section above apply here as well. + +## quick start (CPU) + +The "I am so GPU poor that I don't even have one" section. You can still train! But you won't go too far. You can still finetune a GPT-2 small (124M parameter model) to output Shakespeare-like text, as an example: + +```bash pip install -r requirements.txt python dev/data/tinyshakespeare.py python train_gpt2.py -make train_gpt2cu -mpirun -np ./train_gpt2cu +make train_gpt2 +OMP_NUM_THREADS=8 ./train_gpt2 ``` -Sub in the number of GPUs you'd like to run on in the last command. +The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. ## training: more detail -Download and tokenize a dataset. The [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset is the fastest to download and tokenize: +The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to file. So for example when you run: ```bash python dev/data/tinyshakespeare.py ``` -This prints: +We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. The output of this looks like this: ``` -Saved 32768 tokens to (...)/tiny_shakespeare_val.bin -Saved 305260 tokens to (...)/tiny_shakespeare_train.bin +writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin +writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin ``` -The .bin files are raw byte streams of int32 numbers indicating the token ids with the GPT-2 tokenizer. Alternatively you could also tokenize the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset with `tinystories.py`. +The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. -In principle we'd be ready to train the model right here. However the baseline CPU/fp32 reference code is so inefficient that it's not practical to train these models from scratch yet. 
Instead, we initialize with the GPT-2 weights released by OpenAI and just do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C: +In principle, once we have the tokens, we'd be ready to train the model right here. However, current code can't start training from scratch just yet (coming very soon), so we initialize training from the pretrained models released by OpenAI and do finetuning. For that, we have to download the GPT-2 weights and save them as a checkpoint we can load in C. This is what happens when you run this script: ```bash python train_gpt2.py ``` -You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. We can now initialize with these model weights and continue training in raw C. First compile the code: +You'll recognize this code from nanoGPT as a simple GPT-2 reference implementation in PyTorch. This script will download the GPT-2 (124M) model, overfit a single batch of data for 10 iterations, run a few steps of generation, and most importantly it will save three files: 1) the `gpt2_124M.bin` file that contains the raw model weights for loading in C, 2) the `gpt2_124M_debug_state.bin`, which also contains more debug state: the inputs, targets, logits and loss (useful for debugging and unit testing), and finally 3) the `gpt2_tokenizer.bin` which stores the vocabulary for the GPT-2 tokenizer, translating token ids to byte sequences of UTF-8 encoded string pieces. The file also saves both the fp32 versions of the above, and the bfloat16 versions of them for mixed precision training. We can now initialize with these model weights and continue training in raw C. Then we compile the training programs with `make`. There are currently three parallel implementations: ```bash +# the simple, CPU, reference code version make train_gpt2 +# the single-GPU fp32 CUDA version +make train_gpt2fp32cu +# the multi-GPU mixed precision CUDA version +make train_gpt2cu ``` -You can have a look inside the `Makefile` and its comments. It will try to autodetect if OpenMP is available on your system, which is very helpful for speeding up the code at very low cost of code complexity. Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: +You can have a look inside the `Makefile` and its comments. It will try to autodetect a lot of tools and libraries (e.g. cuDNN, OpenMP, OpenMPI, nvcc), and you want to get as many checkmarks as possible. 
For example when I run `make train_gpt2cu USE_CUDNN=1` on my fully configured machine, we see: + +``` +✓ cuDNN found, will run with flash-attention +✓ OpenMP found +✓ OpenMPI found, OK to train with multiple GPUs +✓ nvcc found, including GPU/CUDA support +``` + +Some people seem to experience problems compiling on Ubuntu, have a look at [Issue 19](https://github.com/karpathy/llm.c/issues/19), TLDR you'd want to modify the `CFLAGS`: ``` # try this first @@ -122,7 +149,7 @@ CFLAGS="-Ofast -fno-finite-math-only -Wno-unused-result -march=native" make trai CFLAGS="-O3 -Wno-unused-result -march=native" make train_gpt2 ``` -Once `train_gpt2` is compiled, you can run it: +Once the binary is compiled, we can run it. For example the simplest CPU reference version runs as: ```bash OMP_NUM_THREADS=8 ./train_gpt2 @@ -164,18 +191,27 @@ Allay --- ``` -I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. Also note that if you are seeing token ids instead of text in the generation, it might be because your code is out of date, as Tokenizer decoding was added April 14, 2024. `git pull` the updates, and then re-run `python train_gpt2.py`, which will now also save the tokenizer, which C can read and then use to print text instead of token ids. +I like how Netflix comes up, it's clear that the shadow of the training past is still lurking in the model. I did not attempt to tune the finetuning hyperparameters so it's quite likely this can be improved quite a bit. I also noticed that slightly different platforms (e.g. MacOS / Linux) will (sadly) give very slightly different results, so perhaps don't expect to get the exact numbers or generation above. + +Finally, the code is in flux. If anything weird happens that you didn't expect or that worked previously, try to `git pull`, re-run all the commands above, reference back to this README, etc. ## test -I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with: +I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. On the CPU as an example, compile and run with: ```bash make test_gpt2 ./test_gpt2 ``` -This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. +This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. To test the GPU version I run: + +```bash +# fp32 test (cudnn not supported) +make test_gpt2cu PRECISION=FP32 && ./test_gpt2cu +# mixed precision cudnn test +make test_gpt2cu USE_CUDNN=1 && ./test_gpt2cu +``` ## tutorial @@ -183,7 +219,7 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layer ## CUDA -The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we roughly match the speed of PyTorch. 
The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. +The full training loop is also implemented in pure CUDA in one file, but optimizations of the kernels are ongoing. Currently, we slightly exceed the speed of PyTorch Nightly. The way we organize code is that we have a growing collection of kernels of increasing complexity in the `dev/cuda` folder, see [dev/cuda/README.md](dev/cuda/README.md). We then copy paste the best kernels into the main training loop in the single training file `train_gpt2cu.cu`. **WIP alert, April 23**. We merged the first version of mixed precision training code. I checkpointed the fp32 version to separate files that include `_fp32` in their filename, and would like to preserve this version in the root of the repo because it 1) doesn't require the most up to date CUDA and will a lot more likely compile and is more portable, 2) it is a lot simpler and acts as reference. In fact, we'd like to diverge the fp32 version in the direction of being pure CUDA (e.g. do not even call cuBLAS by default), to be used as an educational reference, maybe even a kernel of a course on CUDA. The "mainline" development concerned with speed will from there on move to the [train_gpt2.cu](train_gpt2.cu) file, which includes mixed precision training. @@ -198,7 +234,7 @@ make test_gpt2fp32cu This prints `overall okay: 1`. So the forward activations, backward gradients, and the individual loss values for 10 iterations all match exactly. -**Training**. To train GPT-2 in a single file of CUDA, run the train script: +**Training**. To train on single GPU in fp32: ```bash make train_gpt2fp32cu @@ -228,9 +264,7 @@ For on his rock shall he be opencast. Keep on with me, my ``` -This runs on my A100 in about ~10 seconds. This training loop in the PyTorch script is about 80ms/iteration, so we are slightly better than PyTorch here. However, this is measured with PyTorch that is a bit stale (I'm on 2.1.0) and we're not yet including FlashAttention or the PyTorch scaled_dot_product_attention fused operation. - -We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: +This runs on my A100 in about ~10 seconds. We can compare to naive PyTorch like this, where we turn on `torch.compile` and the use of TensorCores, which use tf32 type: ```bash python train_gpt2.py --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1 From 099d30f8140eca899d07e00c4708c69bc36b9261 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 13:08:18 +0000 Subject: [PATCH 139/172] add a super small crappy glob for windows that only matches a single unique file. this will make CI happy but we can't train on sharded data on windows until this is improved --- dataloader.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/dataloader.h b/dataloader.h index 93019317f..d04fc03ed 100644 --- a/dataloader.h +++ b/dataloader.h @@ -2,15 +2,44 @@ Implements a medium simple DataLoader for a distributed training setup. 
*/ -#include #include #include #include #include +#include +#include // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck // defines: mallocCheck #include "utils.h" +// ---------------------------------------------------------------------------- +// we need glob to list files matching a pattern +// windows does not have glob, so we fall back on a very simple implementation +// this implementation doesn't actually do a glob, it assumes that the "pattern" +// is exactly the single file of interest +#ifndef _WIN32 +#include +#else + +typedef struct glob_t { + size_t gl_pathc; + char **gl_pathv; +} glob_t; + +int glob(const char *pattern, int flags, void *unused, glob_t *pglob) { + assert(strstr(pattern, "*") == NULL); // we don't support * here + pglob->gl_pathc = 1; + pglob->gl_pathv = (char **)malloc(sizeof(char *)); + if (pglob->gl_pathv == NULL) { exit(EXIT_FAILURE); } // ??? oom? + pglob->gl_pathv[0] = (char *)pattern; + return 0; +} + +void globfree(glob_t* pglob) { + free(pglob->gl_pathv); +} +#endif + // ---------------------------------------------------------------------------- // Distributed Data Loader #define HEADER_SIZE 256 From 051f3ca53c4e0541a2a81ba09f7c0f96771ad9d6 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 19:24:20 +0000 Subject: [PATCH 140/172] first draft, apparently this works. needs cleanups, and also we are not yet utilizing the full batch dimension. we actually have to load in multiple examples and fully utilize batch --- .gitignore | 2 +- dataloader.h | 170 ++++++++++++++++++++++++++++++++++++++++ dev/data/data_common.py | 61 ++++++++++++++ dev/data/hellaswag.py | 33 ++++++-- train_gpt2.cu | 63 +++++++++++++-- 5 files changed, 317 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 05391b6d1..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb/ +dev/data/fineweb10B/ dev/data/hellaswag/ dev/data/mmlu/ dev/data/tinyshakespeare/ diff --git a/dataloader.h b/dataloader.h index d04fc03ed..72055d8db 100644 --- a/dataloader.h +++ b/dataloader.h @@ -195,3 +195,173 @@ void dataloader_free(DataLoader *loader) { fcloseCheck(loader->tokens_file); globfree(&loader->glob_result); } + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like) HellaSwag and MMLU are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve these style of evals +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - delimiter of 2**16-1, i.e. 65,535 + - , bytes encoding this example, allowing efficient skip to next + - , the index of the example in the dataset + -