
Commit

the full forward pass of GPT-2 in one file of pure CUDA
karpathy committed Apr 10, 2024
1 parent d8e2a36 commit 80f52e5
Showing 8 changed files with 1,300 additions and 21 deletions.
14 changes: 11 additions & 3 deletions Makefile
@@ -34,16 +34,24 @@ else
endif

# PHONY means these targets will always be executed
-.PHONY: all train_gpt2 test_gpt2
+.PHONY: all train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

# default target is all
-all: train_gpt2 test_gpt2
+all: train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

train_gpt2: train_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) -o $@

test_gpt2: test_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) -o $@

# you may want to disable warnings, e.g. by appending -Xcompiler -Wno-unused-result
train_gpt2cu: train_gpt2.cu
	nvcc -O3 --use_fast_math $< -lcublas -o $@

test_gpt2cu: test_gpt2.cu
	nvcc -O3 --use_fast_math $< -lcublas -o $@
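# note: --use_fast_math trades bit-exact IEEE math for speed; the tests compare
# against the PyTorch reference with a loose 1e-2 tolerance, so small numeric
# drift from the fast intrinsics is tolerated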

clean:
-	rm -f train_gpt2 test_gpt2
+	rm -f train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu

49 changes: 47 additions & 2 deletions README.md
@@ -75,7 +75,8 @@ The generation just gives you the token ids for now, which we have to decode back
```python
import tiktoken
enc = tiktoken.get_encoding("gpt2")
-print(enc.decode(list(map(int, "50256 16773 18162 21986 11 198 13681 263 23875 198 3152 262 11773 2910 198 1169 6002 6386 2583 286 262 11858 198 20424 428 3135 7596 995 3675 13 198 40 481 407 736 17903 11 329 703 6029 706 4082 198 42826 1028 1128 633 263 11 198 10594 407 198 2704 454 680 1028 262 1027 28860 286 198 3237 323".split()))))
+ptok = lambda x: print(enc.decode(list(map(int, x.strip().split()))))
+ptok("50256 16773 18162 21986 11 198 13681 263 23875 198 3152 262 11773 2910 198 1169 6002 6386 2583 286 262 11858 198 20424 428 3135 7596 995 3675 13 198 40 481 407 736 17903 11 329 703 6029 706 4082 198 42826 1028 1128 633 263 11 198 10594 407 198 2704 454 680 1028 262 1027 28860 286 198 3237 323")
```

which prints:
@@ -99,7 +100,7 @@ I like how Netflix comes up, it's clear that the shadow of the training past is

I am also attaching a simple unit test for making sure our C code agrees with the PyTorch code. Compile and run with:

-```
+```bash
make test_gpt2
./test_gpt2
```
@@ -114,6 +115,50 @@ I attached a very small tutorial here, in [doc/layernorm/layernorm.md](doc/layernorm/layernorm.md)

CUDA port is WIP, I'm keeping the growing collection of kernels in the `dev` folder, e.g. see [dev/cuda/README.md](dev/cuda/README.md).

As of April 10, 2024, the full forward pass is now implemented in pure CUDA in one file. First we can check that all of the logits and the final loss match the PyTorch reference:

```bash
make test_gpt2cu
./test_gpt2cu
```

This prints `overall okay: 1`. Now that we are computing all the right values, we can time our code. We can't train yet because the backward pass and parameter update are not implemented, but we can run the training loop and look at the timings:

```bash
make train_gpt2cu
./train_gpt2cu
```

This will run GPT-2 (124M) in one file of pure CUDA (see [train_gpt2.cu](train_gpt2.cu)), using batch size 4 and sequence length 1024. It will print a bunch of hyperparameters and then the "training":

```
val loss 4.517294
step 0: train loss 4.367857 (took 112.135004 ms)
step 1: train loss 4.406483 (took 112.555327 ms)
step 2: train loss 4.484838 (took 111.380248 ms)
...
```
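As an aside on how such timings are taken: CUDA kernel launches are asynchronous, so a CPU-side timer is only meaningful once the GPU has actually finished its work. Below is a minimal sketch of the per-step timing pattern, with an explicit `cudaDeviceSynchronize()` for clarity; it assumes the `GPT2` struct and `gpt2_forward` from [train_gpt2.cu](train_gpt2.cu) are in scope, so the loop itself is illustrative rather than the repo's exact code:

```c
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>
// assumes GPT2 and gpt2_forward are available, e.g. via #include "train_gpt2.cu"
// (the same trick test_gpt2.cu uses)

void time_forward(GPT2 *model, int *x, int *y, int B, int T, int nsteps) {
    for (int step = 0; step < nsteps; step++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        gpt2_forward(model, x, y, B, T);  // queues kernels on the GPU
        cudaDeviceSynchronize();          // wait for the GPU before stopping the clock
        clock_gettime(CLOCK_MONOTONIC, &end);
        double ms = 1e3 * (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e6;
        printf("step %d: train loss %f (took %f ms)\n", step, model->mean_loss, ms);
    }
}
```

(A forward pass that ends with a blocking device-to-host copy of the loss synchronizes implicitly, which is why a plain CPU timer around it can still be valid.)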

The loss changes from step to step because each step loads a fresh batch from our dataset, but there is no actual training, so the losses won't trend down over time. In any case, on my A100 40GB PCIe GPU we see about 111ms/iteration. We can compare this to PyTorch fp32 training by calling our python script like this:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4
```

This shows a time per iteration with the same hyperparameters (batch size 4, sequence length 1024) of about 180ms/iteration. We can then enable `torch.compile` by adding the `--compile 1` flag:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1
```

The first iteration now takes about 20 seconds (compilation time), but all following iterations take ~86ms. If we additionally turn on the use of fp32 tensorcores (only GPUs since Volta) with `--tensorcores 1`:

```bash
python train_gpt2.py --inference_only 1 --write_tensors 0 --sequence_length 1024 --batch_size 4 --compile 1 --tensorcores 1
```

The time drops to 26ms/iteration. So we have a gap to close :)! At the current 111ms we are about 4.2X slower than PyTorch.
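For intuition, here is roughly what the PyTorch side of this comparison boils down to. This is a minimal sketch: the tiny stand-in model and the guess that `--tensorcores 1` maps to `torch.set_float32_matmul_precision("high")` (TF32) are illustrative assumptions, not the actual internals of [train_gpt2.py](train_gpt2.py):

```python
import time
import torch

# stand-in module so this sketch actually runs; the real comparison uses
# the GPT-2 (124M) model defined in train_gpt2.py
model = torch.nn.Sequential(
    torch.nn.Embedding(50257, 768),
    torch.nn.Linear(768, 50257),
).cuda()

torch.set_float32_matmul_precision("high")  # assumed effect of --tensorcores 1 (TF32)
model = torch.compile(model)                # what --compile 1 enables; step 0 pays compile time

x = torch.randint(0, 50257, (4, 1024), device="cuda")  # batch size 4, sequence length 1024
with torch.no_grad():  # --inference_only 1: forward pass only, no backward/update
    for step in range(5):
        torch.cuda.synchronize()
        t0 = time.time()
        model(x)
        torch.cuda.synchronize()  # let the GPU finish before stopping the clock
        print(f"step {step}: {(time.time() - t0) * 1000:.1f} ms")
```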

## license

MIT
2 changes: 1 addition & 1 deletion dev/cuda/matmul_forward.cu
@@ -8,7 +8,7 @@ nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_forward.cu -o matmul_forward
version 1 is a naive port from CPU code to kernel: parallelizes over B,T, loops over C
OMP_NUM_THREADS=32 ./matmul_forward 1
-version 2 parallelizes over all of B,T,C
+version 2 calls cuBLAS, very fast
OMP_NUM_THREADS=32 ./matmul_forward 2
*/
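For context on why the cuBLAS version is so fast: the whole forward matmul collapses into a single SGEMM call. Below is a sketch of the core call, assuming row-major `inp` of shape (B*T, C) and `weight` of shape (OC, C); the exact handle setup, layout bookkeeping, and bias handling in matmul_forward.cu may differ:

```c
#include <cublas_v2.h>

// computes out = inp @ weight^T, i.e. out has shape (B*T, OC), row-major.
// cuBLAS is column-major, so a row-major (B*T, C) matrix reads as a
// column-major (C, B*T) one; the transpose flags below account for that.
void matmul_forward_cublas(cublasHandle_t handle,
                           float *out, const float *inp, const float *weight,
                           int BT, int C, int OC) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                OC, BT, C,
                &alpha, weight, C, inp, C,
                &beta, out, OC);
    // (any bias would typically be added by a separate elementwise kernel)
}
```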

6 changes: 6 additions & 0 deletions test_gpt2.c
@@ -166,6 +166,12 @@ int main(int argc, char *argv[]) {

printf("overall okay: %d\n", allok);

// free everything
free(x);
free(y);
free(expected_logits);
free(expected_loss);
free(expected_grads_memory);
gpt2_free(&model);
return 0;
}
124 changes: 124 additions & 0 deletions test_gpt2.cu
@@ -0,0 +1,124 @@
#define TESTING
#include "train_gpt2.cu"
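// note: defining TESTING before the #include above makes train_gpt2.cu skip
// its own main() (via the #ifndef TESTING guard there), so this test reuses
// the full model implementation and only supplies the main() below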

// poor man's tensor checker
int check_tensor(float *a, float *b, int n, char* label) {
    int print_upto = 5;
    int ok = 1;
    printf("%s\n", label);
    for (int i = 0; i < n; i++) {
        if (fabs(a[i] - b[i]) <= 1e-2) {
            if (i < print_upto) { printf("OK "); }
        } else {
            if (i < print_upto) { printf("NOT OK "); }
            ok = 0;
        }
        if (i < print_upto) { printf("%f %f\n", a[i], b[i]); }
    }
    // print the final result
    if (ok) {
        printf("TENSOR OK\n");
    } else {
        printf("TENSOR NOT OK\n");
    }
    return ok;
}

int main(int argc, char *argv[]) {

    // build the GPT-2 model from a checkpoint
    GPT2 model;
    gpt2_build_from_checkpoint(&model, "gpt2_124M.bin");

    int C = model.config.channels;
    int V = model.config.vocab_size;
    int maxT = model.config.max_seq_len;
    int L = model.config.num_layers;

    // load additional information that we will use for debugging and error checking
    FILE *state_file = fopen("gpt2_124M_debug_state.bin", "rb");
    if (state_file == NULL) { printf("Error opening state file\n"); exit(1); }
    int state_header[256];
    fread(state_header, sizeof(int), 256, state_file);
    if (state_header[0] != 20240327) { printf("Bad magic in state file\n"); exit(1); }
    if (state_header[1] != 1) { printf("Bad version in state file\n"); exit(1); }
    int B = state_header[2]; // batch size, e.g. 4
    int T = state_header[3]; // time / sequence length (e.g. 64, up to maxT)
    printf("[State]\n");
    printf("batch_size: %d\n", B);
    printf("seq_len: %d\n", T);

    ParameterTensors expected_grads;
    float* expected_grads_memory = malloc_and_point_parameters(&expected_grads, model.param_sizes, 0);
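    // (the trailing 0 presumably requests host memory here, so that the fread
    // below can fill the expected gradients directly from the state file)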

    // inputs and expected outputs, only used for error checking
    int* x = (int*) malloc(B * T * sizeof(int));
    int* y = (int*) malloc(B * T * sizeof(int));
    float* expected_logits = (float*) malloc(B * T * V * sizeof(float));
    float* expected_loss = (float*) malloc(1 * sizeof(float));

    // read reference information from Python
    fread(x, sizeof(int), B*T, state_file);
    fread(y, sizeof(int), B*T, state_file);
    fread(expected_logits, sizeof(float), B*T*V, state_file);
    fread(expected_loss, sizeof(float), 1, state_file);
    fread(expected_grads_memory, sizeof(float), model.num_parameters, state_file);
    fclose(state_file);

    // overall OK signal for the test
    int allok = 1;

    // let's do 10 training iterations, following the PyTorch code
    float losses[10];
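    // (losses is not filled in yet: with no backward pass there is nothing to
    // train, so the per-step loss checks from the CPU test don't apply here)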
    for (int step = 0; step < 10; step++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        gpt2_forward(&model, x, y, B, T);
        clock_gettime(CLOCK_MONOTONIC, &end);
        double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;

        if (step == 0) {
            // error checking at step 0 for reference activations

            // at this point, the computed logits should be equal to expected_logits, let's compare
            // copy logits to CPU so we can compare them
            float* logits_cpu = (float*) malloc(B * T * V * sizeof(float));
            cudaMemcpy(logits_cpu, model.acts.logits, B * T * V * sizeof(float), cudaMemcpyDeviceToHost);
            int logits_ok = 1;
            for (int i = 0; i < B*T*V; i++) {
                if (i < 3) {
                    printf("%f %f\n", expected_logits[i], logits_cpu[i]);
                }
                if (fabs(expected_logits[i] - logits_cpu[i]) >= 1e-2) {
                    printf("MISMATCH AT INDEX %d: ", i);
                    printf("%f %f\n", expected_logits[i], logits_cpu[i]);
                    logits_ok = 0;
                    break;
                }
            }
            if (!logits_ok) { printf("NOT "); }
            printf("OK (LOGITS)\n");
            allok = allok && logits_ok;
            free(logits_cpu);

            // compare the achieved loss
            if (fabs(model.mean_loss - *expected_loss) >= 1e-2) {
                printf("LOSS MISMATCH: %f %f\n", model.mean_loss, *expected_loss);
                allok = 0;
            } else {
                printf("LOSS OK: %f %f\n", model.mean_loss, *expected_loss);
            }
        }
    }

printf("overall okay: %d\n", allok);

// free everything
free(x);
free(y);
free(expected_logits);
free(expected_loss);
free(expected_grads_memory);
gpt2_free(&model);
return 0;
}
2 changes: 1 addition & 1 deletion train_gpt2.c
@@ -925,7 +925,7 @@ void gpt2_free(GPT2 *model) {
}

#ifndef TESTING
-// if we are TESTING (see test.c), we'll skip the int main below
+// if we are TESTING (see test_gpt2.c), we'll skip the int main below

// ----------------------------------------------------------------------------
// data loader lite
(diffs for the remaining changed files, including train_gpt2.cu, are not shown here)
