forked from karpathy/llm.c
Showing 3 changed files with 103 additions and 86 deletions.
@@ -0,0 +1,93 @@
/*
Defines the GPT-2 Tokenizer.
Only supports decoding, i.e.: tokens (integers) -> strings
This is all we need for unconditional generation.
If we wanted to later prompt the model, we'd have to add encoding.
Which could be tricky in C because of the regex involved, to look into later.
*/

#include <stdio.h>   // printf, FILE, fopen
#include <stdlib.h>  // free
#include <stdint.h>
#include <ctype.h>
#include <assert.h>
// our own utilities
// defines fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
#include "utils.h"

// ----------------------------------------------------------------------------

typedef struct {
    uint32_t vocab_size;
    char **token_table;
    int init_ok;
} Tokenizer;

void safe_printf(const char *piece) {
    // the tokens are raw bytes, and we only want to print the printable ones
    // many bytes can be various control codes, backspace, etc.
    if (piece == NULL) { return; }
    if (piece[0] == '\0') { return; }
    // handle individual byte tokens
    // every token is asserted to be at least one byte so doing piece[1] is ok
    if (piece[1] == '\0') {
        unsigned char byte_val = piece[0];
        if (!(isprint(byte_val) || isspace(byte_val))) {
            return; // weird byte, don't print it
        }
    }
    printf("%s", piece);
}

void tokenizer_init(Tokenizer *tokenizer, const char *filename) {
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        // try to be more helpful as we just added this feature, erase later
        printf("---\n");
        printf("WARNING: Failed to open the tokenizer file %s\n", filename);
        printf("The Tokenizer is a new feature added April 14 2024.\n");
        printf("Re-run `python train_gpt2.py` to write it\n");
        printf("---\n");
        tokenizer->init_ok = 0;
        return;
    }
    // read in the header
    uint32_t header[256];
    freadCheck(header, sizeof(uint32_t), 256, file);
    assert(header[0] == 20240328);
    assert(header[1] == 1);
    tokenizer->vocab_size = header[2];
    // read in all the tokens
    unsigned char length;
    tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *));
    for (uint32_t i = 0; i < tokenizer->vocab_size; i++) {
        freadCheck(&length, sizeof(unsigned char), 1, file);
        assert(length > 0); // every token should be at least one character
        char *token_bytes = (char *)mallocCheck(length + 1);
        freadCheck(token_bytes, sizeof(char), length, file);
        token_bytes[length] = '\0'; // add null terminator for printing
        tokenizer->token_table[i] = token_bytes;
    }
    // cleanups
    fcloseCheck(file);
    tokenizer->init_ok = 1;
}

const char *tokenizer_decode(Tokenizer *tokenizer, uint32_t token_id) {
    if (tokenizer->init_ok == 0) {
        return NULL;
    }
    if (token_id < tokenizer->vocab_size) {
        return tokenizer->token_table[token_id];
    } else {
        printf("invalid token id %u!\n", token_id);
        return NULL;
    }
}

void tokenizer_free(Tokenizer *tokenizer) {
    if (tokenizer->init_ok) {
        for (uint32_t i = 0; i < tokenizer->vocab_size; i++) {
            free(tokenizer->token_table[i]);
        }
        free(tokenizer->token_table);
    }
}
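
For reference, here is a minimal usage sketch of the API above (not part of the commit): it assumes the new header is included as `tokenizer.h`, that the tokenizer file written by `train_gpt2.py` is named `gpt2_tokenizer.bin`, and that the token ids come from some sampling loop; the filename and the example ids are illustrative only.

// hypothetical usage sketch, assuming the header/file names described above
#include <stdio.h>
#include <stdint.h>
#include "tokenizer.h"

int main(void) {
    Tokenizer tokenizer;
    tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); // path is an assumption

    // pretend these token ids came out of the model's sampling loop
    uint32_t tokens[] = {50256, 15496, 11, 995};
    int n = sizeof(tokens) / sizeof(tokens[0]);
    for (int i = 0; i < n; i++) {
        // returns NULL if init failed or the id is out of range
        const char *piece = tokenizer_decode(&tokenizer, tokens[i]);
        safe_printf(piece); // skips unprintable single-byte tokens
    }
    printf("\n");

    tokenizer_free(&tokenizer);
    return 0;
}

If `tokenizer_init` cannot open the file it only sets `init_ok = 0`, so the decode calls above degrade gracefully to printing nothing rather than crashing.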