From b4c346ac2a92686434ffe52d9835051bf219748e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 29 Apr 2024 20:23:41 +0000
Subject: [PATCH] don't hardcode the EOT token and store it in the Tokenizer file

---
 tokenizer.h   | 13 ++++++++++++-
 train_gpt2.cu |  7 +++----
 train_gpt2.py |  3 ++-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/tokenizer.h b/tokenizer.h
index aba99eb2b..5f2418b21 100644
--- a/tokenizer.h
+++ b/tokenizer.h
@@ -19,6 +19,7 @@ typedef struct {
     uint32_t vocab_size;
     char **token_table;
     int init_ok;
+    int eot_token; // <|endoftext|> token id
 } Tokenizer;
 
 void safe_printf(const char *piece) {
@@ -53,8 +54,18 @@ void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
     uint32_t header[256];
     freadCheck(header, sizeof(uint32_t), 256, file);
     assert(header[0] == 20240328);
-    assert(header[1] == 1);
+    int version = header[1];
     tokenizer->vocab_size = header[2];
+    if (version == 1) {
+        // version 1 didn't include the EOT token id
+        // so we assume it is 50256, the EOT in GPT-2
+        tokenizer->eot_token = 50256;
+    } else if (version == 2) {
+        tokenizer->eot_token = header[3];
+    } else {
+        fprintf(stderr, "Tokenizer model file %s has bad version: %d\n", filename, version);
+        exit(EXIT_FAILURE);
+    }
     // read in all the tokens
     unsigned char length;
     tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *));
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 08f6dc73d..89c23d8e7 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -2005,8 +2005,6 @@ void dataloader_free(DataLoader *loader) {
 // ----------------------------------------------------------------------------
 // sampler: takes probabilities and samples integers from them
 
-#define GPT2_EOT 50256
-
 int sample_softmax(const float* logits, int n, float coin) {
     // sample index from logits (converted to probabilities using softmax)
     // coin is a random number in [0, 1), usually from random_f32()
@@ -2222,9 +2220,10 @@ int main(int argc, char *argv[]) {
 
         // once in a while do model inference to print generated text
         if (multi_gpu_config.process_rank == 0 && (step > 0 && (step % sample_every) == 0 || last_step)) {
-            // fill up gen_tokens with the GPT2_EOT, which kicks off the generation
+            // fill up gen_tokens with the <|endoftext|> token, which kicks off the generation
+            int eot_token = tokenizer.eot_token;
             for(int i = 0; i < B * T; ++i) {
-                gen_tokens[i] = GPT2_EOT;
+                gen_tokens[i] = eot_token;
             }
             // now sample from the model autoregressively
             printf("generating:\n---\n");
diff --git a/train_gpt2.py b/train_gpt2.py
index 74d56c38a..895d5eb42 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -351,8 +351,9 @@ def write_tokenizer(enc, filename):
     n = enc.max_token_value + 1
     header = torch.zeros(256, dtype=torch.int32)
     header[0] = 20240328 # magic
-    header[1] = 1 # tokenizer version = 1
+    header[1] = 2 # tokenizer version = 2 (1 -> 2: includes EOT token)
    header[2] = n # number of tokens
+    header[3] = enc.eot_token # EOT token
     with open(filename, "wb") as file:
         file.write(header.numpy().tobytes())
     for i in range(n):
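
Note: for anyone consuming the tokenizer file outside this repo, below is a minimal Python sketch of reading the version 2 header introduced by this patch, mirroring the C logic in tokenizer_init(). It assumes little-endian int32 values (matching the numpy tobytes() write path in write_tokenizer()); the read_tokenizer() helper name is hypothetical and not part of the repo.

    import struct

    def read_tokenizer(filename):
        # read the 256-int32 header followed by length-prefixed token strings
        with open(filename, "rb") as f:
            header = struct.unpack("<256i", f.read(256 * 4))
            assert header[0] == 20240328          # magic number
            version = header[1]
            vocab_size = header[2]
            if version == 1:
                eot_token = 50256                 # v1 files: assume the GPT-2 EOT id
            elif version == 2:
                eot_token = header[3]             # v2 files: EOT id stored in the header
            else:
                raise ValueError(f"bad tokenizer version: {version}")
            token_table = []
            for _ in range(vocab_size):
                length = f.read(1)[0]             # 1-byte length prefix per token
                token_table.append(f.read(length))
        return token_table, eot_token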