From b4c346ac2a92686434ffe52d9835051bf219748e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 29 Apr 2024 20:23:41 +0000
Subject: [PATCH] don't hardcode the EOT token and store it in the Tokenizer file

---
 tokenizer.h   | 13 ++++++++++++-
 train_gpt2.cu |  7 +++----
 train_gpt2.py |  3 ++-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/tokenizer.h b/tokenizer.h
index aba99eb2b..5f2418b21 100644
--- a/tokenizer.h
+++ b/tokenizer.h
@@ -19,6 +19,7 @@ typedef struct {
     uint32_t vocab_size;
     char **token_table;
     int init_ok;
+    int eot_token; // <|endoftext|> token id
 } Tokenizer;
 
 void safe_printf(const char *piece) {
@@ -53,8 +54,18 @@ void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
     uint32_t header[256];
     freadCheck(header, sizeof(uint32_t), 256, file);
     assert(header[0] == 20240328);
-    assert(header[1] == 1);
+    int version = header[1];
     tokenizer->vocab_size = header[2];
+    if (version == 1) {
+        // version 1 didn't include the EOT token id
+        // so we assume it is 50256, the EOT in GPT-2
+        tokenizer->eot_token = 50256;
+    } else if (version == 2) {
+        tokenizer->eot_token = header[3];
+    } else {
+        fprintf(stderr, "Tokenizer model file %s has bad version: %d\n", filename, version);
+        exit(EXIT_FAILURE);
+    }
     // read in all the tokens
     unsigned char length;
     tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *));
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 08f6dc73d..89c23d8e7 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -2005,8 +2005,6 @@ void dataloader_free(DataLoader *loader) {
 // ----------------------------------------------------------------------------
 // sampler: takes probabilities and samples integers from them
 
-#define GPT2_EOT 50256
-
 int sample_softmax(const float* logits, int n, float coin) {
     // sample index from logits (converted to probabilities using softmax)
     // coin is a random number in [0, 1), usually from random_f32()
@@ -2222,9 +2220,10 @@ int main(int argc, char *argv[]) {
 
         // once in a while do model inference to print generated text
         if (multi_gpu_config.process_rank == 0 && (step > 0 && (step % sample_every) == 0 || last_step)) {
-            // fill up gen_tokens with the GPT2_EOT, which kicks off the generation
+            // fill up gen_tokens with the <|endoftext|> token, which kicks off the generation
+            int eot_token = tokenizer.eot_token;
             for(int i = 0; i < B * T; ++i) {
-                gen_tokens[i] = GPT2_EOT;
+                gen_tokens[i] = eot_token;
             }
             // now sample from the model autoregressively
             printf("generating:\n---\n");
diff --git a/train_gpt2.py b/train_gpt2.py
index 74d56c38a..895d5eb42 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -351,8 +351,9 @@ def write_tokenizer(enc, filename):
     n = enc.max_token_value + 1
     header = torch.zeros(256, dtype=torch.int32)
     header[0] = 20240328 # magic
-    header[1] = 1 # tokenizer version = 1
+    header[1] = 2 # tokenizer version = 2 (1 -> 2: includes EOT token)
    header[2] = n # number of tokens
+    header[3] = enc.eot_token # EOT token
     with open(filename, "wb") as file:
         file.write(header.numpy().tobytes())
     for i in range(n):
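
Note: for anyone consuming the tokenizer file outside this repo, below is a minimal Python sketch of reading the version 2 header introduced by this patch, mirroring the C logic in tokenizer_init(). It assumes little-endian int32 values (matching the numpy tobytes() write path in write_tokenizer()); the read_tokenizer() helper name is hypothetical and not part of the repo.

    import struct

    def read_tokenizer(filename):
        # read the 256-int32 header followed by length-prefixed token strings
        with open(filename, "rb") as f:
            header = struct.unpack("<256i", f.read(256 * 4))
            assert header[0] == 20240328          # magic number
            version = header[1]
            vocab_size = header[2]
            if version == 1:
                eot_token = 50256                 # v1 files: assume the GPT-2 EOT id
            elif version == 2:
                eot_token = header[3]             # v2 files: EOT id stored in the header
            else:
                raise ValueError(f"bad tokenizer version: {version}")
            token_table = []
            for _ in range(vocab_size):
                length = f.read(1)[0]             # 1-byte length prefix per token
                token_table.append(f.read(length))
        return token_table, eot_token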