forked from karpathy/llm.c
Showing 3 changed files with 103 additions and 86 deletions.
@@ -0,0 +1,93 @@
/*
Defines the GPT-2 Tokenizer.
Only supports decoding, i.e.: tokens (integers) -> strings
This is all we need for unconditional generation.
If we wanted to later prompt the model, we'd have to add encoding.
Which could be tricky in C because of the regex involved, to look into later.
*/

#include <stdio.h>   // printf, FILE, fopen
#include <stdlib.h>  // free
#include <stdint.h>
#include <ctype.h>
#include <assert.h>
// our own utilities
// defines fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
#include "utils.h"

// ----------------------------------------------------------------------------

typedef struct {
    uint32_t vocab_size;
    char **token_table;
    int init_ok;
} Tokenizer;

void safe_printf(const char *piece) {
    // the tokens are raw bytes, and we only want to print the printable ones
    // many bytes can be various control codes, backspace, etc.
    if (piece == NULL) { return; }
    if (piece[0] == '\0') { return; }
    // handle individual byte tokens
    // every token is asserted to be at least one byte so doing piece[1] is ok
    if (piece[1] == '\0') {
        unsigned char byte_val = piece[0];
        if (!(isprint(byte_val) || isspace(byte_val))) {
            return; // weird byte, don't print it
        }
    }
    printf("%s", piece);
}

void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        // try to be more helpful as we just added this feature, erase later
        printf("---\n");
        printf("WARNING: Failed to open the tokenizer file %s\n", filename);
        printf("The Tokenizer is a new feature added April 14 2024.\n");
        printf("Re-run `python train_gpt2.py` to write it\n");
        printf("---\n");
        tokenizer->init_ok = 0;
        return;
    }
    // read in the header
    uint32_t header[256];
    freadCheck(header, sizeof(uint32_t), 256, file);
    assert(header[0] == 20240328);
    assert(header[1] == 1);
    tokenizer->vocab_size = header[2];
    // read in all the tokens
    unsigned char length;
    tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *));
    for (uint32_t i = 0; i < tokenizer->vocab_size; i++) {
        freadCheck(&length, sizeof(unsigned char), 1, file);
        assert(length > 0); // every token should be at least one character
        char *token_bytes = (char *)mallocCheck(length + 1);
        freadCheck(token_bytes, sizeof(char), length, file);
        token_bytes[length] = '\0'; // add null terminator for printing
        tokenizer->token_table[i] = token_bytes;
    }
    // cleanups
    fcloseCheck(file);
    tokenizer->init_ok = 1;
}

const char *tokenizer_decode(Tokenizer *tokenizer, uint32_t token_id) {
    if (tokenizer->init_ok == 0) {
        return NULL;
    }
    if (token_id < tokenizer->vocab_size) {
        return tokenizer->token_table[token_id];
    } else {
        printf("invalid token id %u!\n", token_id);
        return NULL;
    }
}

void tokenizer_free(Tokenizer *tokenizer) {
    if (tokenizer->init_ok) {
        for (uint32_t i = 0; i < tokenizer->vocab_size; i++) {
            free(tokenizer->token_table[i]);
        }
        free(tokenizer->token_table);
    }
}
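
For reference, here is a minimal usage sketch of the API above (not part of the commit): it assumes the new header is included as `tokenizer.h`, that the tokenizer file written by `train_gpt2.py` is named `gpt2_tokenizer.bin`, and that the token ids come from some sampling loop; the filename and the example ids are illustrative only.

// hypothetical usage sketch, assuming the header/file names described above
#include <stdio.h>
#include <stdint.h>
#include "tokenizer.h"

int main(void) {
    Tokenizer tokenizer;
    tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); // path is an assumption

    // pretend these token ids came out of the model's sampling loop
    uint32_t tokens[] = {50256, 15496, 11, 995};
    int n = sizeof(tokens) / sizeof(tokens[0]);
    for (int i = 0; i < n; i++) {
        // returns NULL if init failed or the id is out of range
        const char *piece = tokenizer_decode(&tokenizer, tokens[i]);
        safe_printf(piece); // skips unprintable single-byte tokens
    }
    printf("\n");

    tokenizer_free(&tokenizer);
    return 0;
}

If `tokenizer_init` cannot open the file it only sets `init_ok = 0`, so the decode calls above degrade gracefully to printing nothing rather than crashing.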