
Commit

the full forward pass of GPT-2 in one file of pure CUDA
karpathy committed Apr 10, 2024
1 parent d8e2a36 commit 80f52e5
Showing 8 changed files with 1,300 additions and 21 deletions.
14 changes: 11 additions & 3 deletions Makefile
@@ -34,16 +34,24 @@ else
endif

# PHONY means these targets will always be executed
-.PHONY: all train_gpt2 test_gpt2
+.PHONY: all train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

# default target is all
-all: train_gpt2 test_gpt2
+all: train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

train_gpt2: train_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) -o $@

test_gpt2: test_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) -o $@

# you may want to disable warnings, e.g. by appending -Xcompiler -Wno-unused-result
train_gpt2cu: train_gpt2.cu
	nvcc -O3 --use_fast_math $< -lcublas -o $@

test_gpt2cu: test_gpt2.cu
	nvcc -O3 --use_fast_math $< -lcublas -o $@
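# note: --use_fast_math trades bit-exact IEEE math for speed; the tests compare
# against the PyTorch reference with a loose 1e-2 tolerance, so small numeric
# drift from the fast intrinsics is tolerated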

clean:
-	rm -f train_gpt2 test_gpt2
+	rm -f train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

49 changes: 47 additions & 2 deletions README.md
@@ -75,7 +75,8 @@ The generation just gives you the token ids for now, which we have to decode back
```python
import tiktoken
enc = tiktoken.get_encoding("gpt2")
-print(enc.decode(list(map(int, "50256 16773 18162 21986 11 198 13681 263 23875 198 3152 262 11773 2910 198 1169 6002 6386 2583 286 262 11858 198 20424 428 3135 7596 995 3675 13 198 40 481 407 736 17903 11 329 703 6029 706 4082 198 42826 1028 1128 633 263 11 198 10594 407 198 2704 454 680 1028 262 1027 28860 286 198 3237 323".split()))))
+ptok = lambda x: print(enc.decode(list(map(int, x.strip().split()))))
+ptok("50256 16773 18162 21986 11 198 13681 263 23875 198 3152 262 11773 2910 198 1169 6002 6386 2583 286 262 11858 198 20424 428 3135 7596 995 3675 13 198 40 481 407 736 17903 11 329 703 6029 706 4082 198 42826 1028 1128 633 263 11 198 10594 407 198 2704 454 680 1028 262 1027 28860 286 198 3237 323")
```

which prints:
@@ -99,7 +100,7 @@ I like how Netflix comes up, it's clear that the shadow of the training past is

I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with:

-```
+```bash
make test_gpt2
./test_gpt2
```
@@ -114,6 +115,50 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layernorm/layernorm.md)

CUDA port is WIP, I'm keeping the growing collection of kernels in the `dev` folder, e.g. see [dev/cuda/README.md](dev/cuda/README.md).

As of April 10, 2024, the full forward pass is now implemented in pure CUDA in one file. First we can check that all of the logits and the final loss match the PyTorch reference:

```bash
make test_gpt2cu
./test_gpt2cu
```

This prints `overall okay: 1`. Now that we are computing all the right values, we can time our code. We can't train yet because the backward pass and parameter update are not implemented, but we can run the training loop and look at the timings:

```bash
make train_gpt2cu
./train_gpt2cu
```

This will run GPT-2 (124M) in one file of pure CUDA (see [train_gpt2.cu](train_gpt2.cu)), using batch size 4 and sequence length 1024. It will print a bunch of hyperparameters and then the "training":

```
val loss 4.517294
step 0: train loss 4.367857 (took 112.135004 ms)
step 1: train loss 4.406483 (took 112.555327 ms)
step 2: train loss 4.484838 (took 111.380248 ms)
...
```
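As an aside on how such timings are taken: CUDA kernel launches are asynchronous, so a CPU-side timer is only meaningful once the GPU has actually finished its work. Below is a minimal sketch of the per-step timing pattern, with an explicit `cudaDeviceSynchronize()` for clarity; it assumes the `GPT2` struct and `gpt2_forward` from [train_gpt2.cu](train_gpt2.cu) are in scope, so the loop itself is illustrative rather than the repo's exact code:

```c
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>
// assumes GPT2 and gpt2_forward are available, e.g. via #include "train_gpt2.cu"
// (the same trick test_gpt2.cu uses)

void time_forward(GPT2 *model, int *x, int *y, int B, int T, int nsteps) {
    for (int step = 0; step < nsteps; step++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        gpt2_forward(model, x, y, B, T);  // queues kernels on the GPU
        cudaDeviceSynchronize();          // wait for the GPU before stopping the clock
        clock_gettime(CLOCK_MONOTONIC, &end);
        double ms = 1e3 * (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e6;
        printf("step %d: train loss %f (took %f ms)\n", step, model->mean_loss, ms);
    }
}
```

(A forward pass that ends with a blocking device-to-host copy of the loss synchronizes implicitly, which is why a plain CPU timer around it can still be valid.)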

The loss changes from step to step because each step loads a fresh batch from our dataset, but there is no actual training, so the losses won't trend down over time. In any case, on my A100 40GB PCIe GPU we see about 111ms/iteration. We can compare this to PyTorch fp32 training by calling our python script like this:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4
```

This shows a time per iteration with the same hyperparameters (batch size 4, sequence length 1024) of about 180ms/iteration. We can then enable `torch.compile` by adding the `--compile 1` flag:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1
```

The first iteration now takes about 20 seconds (compilation time), but all following iterations take ~86ms. If we additionally turn on the use of fp32 tensorcores (only GPUs since Volta) with `--tensorcores 1`:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1
```

The time drops to 26ms/iteration. So we have a gap to close :)! At the current 111ms we are about 4.2X slower than PyTorch.
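For intuition, here is roughly what the PyTorch side of this comparison boils down to. This is a minimal sketch: the tiny stand-in model and the guess that `--tensorcores 1` maps to `torch.set_float32_matmul_precision("high")` (TF32) are illustrative assumptions, not the actual internals of [train_gpt2.py](train_gpt2.py):

```python
import time
import torch

# stand-in module so this sketch actually runs; the real comparison uses
# the GPT-2 (124M) model defined in train_gpt2.py
model = torch.nn.Sequential(
    torch.nn.Embedding(50257, 768),
    torch.nn.Linear(768, 50257),
).cuda()

torch.set_float32_matmul_precision("high")  # assumed effect of --tensorcores 1 (TF32)
model = torch.compile(model)                # what --compile 1 enables; step 0 pays compile time

x = torch.randint(0, 50257, (4, 1024), device="cuda")  # batch size 4, sequence length 1024
with torch.no_grad():  # --inference_only 1: forward pass only, no backward/update
    for step in range(5):
        torch.cuda.synchronize()
        t0 = time.time()
        model(x)
        torch.cuda.synchronize()  # let the GPU finish before stopping the clock
        print(f"step {step}: {(time.time() - t0) * 1000:.1f} ms")
```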

## license

MIT
2 changes: 1 addition & 1 deletion dev/cuda/matmul_forward.cu
@@ -8,7 +8,7 @@ nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_forward.cu -o matmul_forward
version 1 is a naive port from CPU code to kernel: parallelizes over B,T, loops over C
OMP_NUM_THREADS=32 ./matmul_forward 1
-version 2 parallelizes over all of B,T,C
+version 2 calls cuBLAS, very fast
OMP_NUM_THREADS=32 ./matmul_forward 2
*/
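For context on why the cuBLAS version is so fast: the whole forward matmul collapses into a single SGEMM call. Below is a sketch of the core call, assuming row-major `inp` of shape (B*T, C) and `weight` of shape (OC, C); the exact handle setup, layout bookkeeping, and bias handling in matmul_forward.cu may differ:

```c
#include <cublas_v2.h>

// computes out = inp @ weight^T, i.e. out has shape (B*T, OC), row-major.
// cuBLAS is column-major, so a row-major (B*T, C) matrix reads as a
// column-major (C, B*T) one; the transpose flags below account for that.
void matmul_forward_cublas(cublasHandle_t handle,
                           float *out, const float *inp, const float *weight,
                           int BT, int C, int OC) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                OC, BT, C,
                &alpha, weight, C, inp, C,
                &beta, out, OC);
    // (any bias would typically be added by a separate elementwise kernel)
}
```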

6 changes: 6 additions & 0 deletions test_gpt2.c
@@ -166,6 +166,12 @@ int main(int argc, char *argv[]) {

printf("overall okay: %d\n", allok);

// free everything
free(x);
free(y);
free(expected_logits);
free(expected_loss);
free(expected_grads_memory);
gpt2_free(&model);
return 0;
}
124 changes: 124 additions & 0 deletions test_gpt2.cu
@@ -0,0 +1,124 @@
#define TESTING
#include "train_gpt2.cu"
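// note: defining TESTING before the #include above makes train_gpt2.cu skip
// its own main() (via the #ifndef TESTING guard there), so this test reuses
// the full model implementation and only supplies the main() below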

// poor man's tensor checker
int check_tensor(float *a, float *b, int n, char* label) {
    int print_upto = 5;
    int ok = 1;
    printf("%s\n", label);
    for (int i = 0; i < n; i++) {
        if (fabs(a[i] - b[i]) <= 1e-2) {
            if (i < print_upto) { printf("OK "); }
        } else {
            if (i < print_upto) { printf("NOT OK "); }
            ok = 0;
        }
        if (i < print_upto) { printf("%f %f\n", a[i], b[i]); }
    }
    // print the final result
    if (ok) {
        printf("TENSOR OK\n");
    } else {
        printf("TENSOR NOT OK\n");
    }
    return ok;
}

int main(int argc, char *argv[]) {

    // build the GPT-2 model from a checkpoint
    GPT2 model;
    gpt2_build_from_checkpoint(&model, "gpt2_124M.bin");

    int C = model.config.channels;
    int V = model.config.vocab_size;
    int maxT = model.config.max_seq_len;
    int L = model.config.num_layers;

    // load additional information that we will use for debugging and error checking
    FILE *state_file = fopen("gpt2_124M_debug_state.bin", "rb");
    if (state_file == NULL) { printf("Error opening state file\n"); exit(1); }
    int state_header[256];
    fread(state_header, sizeof(int), 256, state_file);
    if (state_header[0] != 20240327) { printf("Bad magic in state file\n"); exit(1); }
    if (state_header[1] != 1) { printf("Bad version in state file\n"); exit(1); }
    int B = state_header[2]; // batch size, e.g. 4
    int T = state_header[3]; // time / sequence length (e.g. 64, up to maxT)
    printf("[State]\n");
    printf("batch_size: %d\n", B);
    printf("seq_len: %d\n", T);

    ParameterTensors expected_grads;
    float* expected_grads_memory = malloc_and_point_parameters(&expected_grads, model.param_sizes, 0);
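    // (the trailing 0 presumably requests host memory here, so that the fread
    // below can fill the expected gradients directly from the state file)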

    // inputs and expected outputs, only used for error checking
    int* x = (int*) malloc(B * T * sizeof(int));
    int* y = (int*) malloc(B * T * sizeof(int));
    float* expected_logits = (float*) malloc(B * T * V * sizeof(float));
    float* expected_loss = (float*) malloc(1 * sizeof(float));

    // read reference information from Python
    fread(x, sizeof(int), B*T, state_file);
    fread(y, sizeof(int), B*T, state_file);
    fread(expected_logits, sizeof(float), B*T*V, state_file);
    fread(expected_loss, sizeof(float), 1, state_file);
    fread(expected_grads_memory, sizeof(float), model.num_parameters, state_file);
    fclose(state_file);

    // overall OK signal for the test
    int allok = 1;

    // let's do 10 training iterations, following the PyTorch code
    float losses[10];
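    // (losses is not filled in yet: with no backward pass there is nothing to
    // train, so the per-step loss checks from the CPU test don't apply here)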
    for (int step = 0; step < 10; step++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        gpt2_forward(&model, x, y, B, T);
        clock_gettime(CLOCK_MONOTONIC, &end);
        double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;

        if (step == 0) {
            // error checking at step 0 for reference activations

            // at this point, the computed logits should be equal to expected_logits, let's compare
            // copy logits to CPU so we can compare them
            float* logits_cpu = (float*) malloc(B * T * V * sizeof(float));
            cudaMemcpy(logits_cpu, model.acts.logits, B * T * V * sizeof(float), cudaMemcpyDeviceToHost);
            int logits_ok = 1;
            for (int i = 0; i < B*T*V; i++) {
                if (i < 3) {
                    printf("%f %f\n", expected_logits[i], logits_cpu[i]);
                }
                if (fabs(expected_logits[i] - logits_cpu[i]) >= 1e-2) {
                    printf("MISMATCH AT INDEX %d: ", i);
                    printf("%f %f\n", expected_logits[i], logits_cpu[i]);
                    logits_ok = 0;
                    break;
                }
            }
            if (!logits_ok) { printf("NOT "); }
            printf("OK (LOGITS)\n");
            allok = allok && logits_ok;
            free(logits_cpu);

            // compare the achieved loss
            if (fabs(model.mean_loss - *expected_loss) >= 1e-2) {
                printf("LOSS MISMATCH: %f %f\n", model.mean_loss, *expected_loss);
                allok = 0;
            } else {
                printf("LOSS OK: %f %f\n", model.mean_loss, *expected_loss);
            }
        }
    }

printf("overall okay: %d\n", allok);

// free everything
free(x);
free(y);
free(expected_logits);
free(expected_loss);
free(expected_grads_memory);
gpt2_free(&model);
return 0;
}
2 changes: 1 addition & 1 deletion train_gpt2.c
@@ -925,7 +925,7 @@ void gpt2_free(GPT2 *model) {
}

#ifndef TESTING
-// if we are TESTING (see test.c), we'll skip the int main below
+// if we are TESTING (see test_gpt2.c), we'll skip the int main below

// ----------------------------------------------------------------------------
// data loader lite
(diffs for the remaining changed files, including train_gpt2.cu, are not shown here)
