Skip to content

Commit

Permalink
Don't hardcode the EOT token; store it in the tokenizer file instead
Browse files Browse the repository at this point in the history
  • Loading branch information
karpathy committed Apr 29, 2024
1 parent 699c254 commit b4c346a
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 6 deletions.
13 changes: 12 additions & 1 deletion tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ typedef struct {
uint32_t vocab_size;
char **token_table;
int init_ok;
int eot_token; // <|endoftext|> token id
} Tokenizer;

void safe_printf(const char *piece) {
Expand Down Expand Up @@ -53,8 +54,18 @@ void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
uint32_t header[256];
freadCheck(header, sizeof(uint32_t), 256, file);
assert(header[0] == 20240328);
assert(header[1] == 1);
int version = header[1];
tokenizer->vocab_size = header[2];
if (version == 1) {
// version 1 didn't include the EOT token id
// so we assume it is 50256, the EOT in GPT-2
tokenizer->eot_token = 50256;
} else if (version == 2) {
tokenizer->eot_token = header[3];
} else {
fprintf(stderr, "Tokenizer model file %s has bad version: %d\n", filename, version);
exit(EXIT_FAILURE);
}
// read in all the tokens
unsigned char length;
tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *));
Expand Down
7 changes: 3 additions & 4 deletions train_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2005,8 +2005,6 @@ void dataloader_free(DataLoader *loader) {
// ----------------------------------------------------------------------------
// sampler: takes probabilities and samples integers from them

#define GPT2_EOT 50256

int sample_softmax(const float* logits, int n, float coin) {
// sample index from logits (converted to probabilities using softmax)
// coin is a random number in [0, 1), usually from random_f32()
Expand Down Expand Up @@ -2222,9 +2220,10 @@ int main(int argc, char *argv[]) {

// once in a while do model inference to print generated text
if (multi_gpu_config.process_rank == 0 && (step > 0 && (step % sample_every) == 0 || last_step)) {
// fill up gen_tokens with the GPT2_EOT, which kicks off the generation
// fill up gen_tokens with the <|endoftext|> token, which kicks off the generation
int eot_token = tokenizer.eot_token;
for(int i = 0; i < B * T; ++i) {
gen_tokens[i] = GPT2_EOT;
gen_tokens[i] = eot_token;
}
// now sample from the model autoregressively
printf("generating:\n---\n");
Expand Down
3 changes: 2 additions & 1 deletion train_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,9 @@ def write_tokenizer(enc, filename):
n = enc.max_token_value + 1
header = torch.zeros(256, dtype=torch.int32)
header[0] = 20240328 # magic
header[1] = 1 # tokenizer version = 1
header[1] = 2 # tokenizer version = 2 (1 -> 2: includes EOT token)
header[2] = n # number of tokens
header[3] = enc.eot_token # EOT token
with open(filename, "wb") as file:
file.write(header.numpy().tobytes())
for i in range(n):
Expand Down

0 comments on commit b4c346a

Please sign in to comment.