
Commit

more changes, trying to help people out because when this merges to master it will brick everyone's code...
karpathy committed May 20, 2024
1 parent 722e5b2 commit f671cf9
Showing 4 changed files with 11 additions and 3 deletions.
3 changes: 3 additions & 0 deletions train_gpt2.c
@@ -1021,6 +1021,9 @@ void dataloader_init(DataLoader *loader, const char* filename, int B, int T) {
     loader->tokens_file = fopen(filename, "rb");
     if (loader->tokens_file == NULL) {
         printf("Error opening tokens file\n");
+        printf("--> HINT: the data directory may have moved recently from data/ to dev/data/(dataset)/\n");
+        printf("--> HINT: refer again to the README file and possibly re-run the dataset prepro script.\n");
+        printf("--> HINT: example: re-run `python dev/data/tinyshakespeare.py`\n");
         exit(1);
     }

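A quick way to tell whether a checkout is affected is to test for the tokens .bin at its new location before launching training. A minimal standalone sketch, assuming the post-move tinyshakespeare filename (the exact .bin name is not part of this diff):

// sketch: verify the tokens file exists under dev/data before training
// (path is illustrative; if the check fails, re-run `python dev/data/tinyshakespeare.py`)
#include <stdio.h>

int main(void) {
    const char *path = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; // assumed new location
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        printf("missing %s --> re-run `python dev/data/tinyshakespeare.py`\n", path);
        return 1;
    }
    fclose(fp);
    printf("found %s, ok to train\n", path);
    return 0;
}
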
2 changes: 1 addition & 1 deletion train_gpt2.cu
@@ -2622,7 +2622,7 @@ void error_usage() {
fprintf(stderr, "Usage: ./train_gpt2cu [options]\n");
fprintf(stderr, "Example: ./train_gpt2cu -i dev/data/tinystories/TinyStories -v 100 -s 100 -g 144 -o stories.log\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -i <string> input dataset prefix (default = data/tiny_shakespeare)\n");
fprintf(stderr, " -i <string> input dataset prefix (default = dev/data/tinyshakespeare/tiny_shakespeare)\n");
fprintf(stderr, " -e <string> input model filename (default = gpt2_124M_bf16.bin)\n");
fprintf(stderr, " -o <string> output log file (default = NULL)\n");
fprintf(stderr, " -b <int> (per-GPU, micro) batch size B (default = 4)\n");
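The -i option takes a dataset prefix rather than a full filename. A minimal sketch of how such a prefix is typically expanded into the train/val token paths; the _train.bin/_val.bin suffix convention is an assumption here, not something shown in this diff:

// sketch: expand a dataset prefix into train/val .bin paths
// (suffixes assumed; check train_gpt2.cu for the exact strings it appends)
#include <stdio.h>

int main(void) {
    const char *prefix = "dev/data/tinyshakespeare/tiny_shakespeare"; // new default from this commit
    char train_path[512], val_path[512];
    snprintf(train_path, sizeof(train_path), "%s_train.bin", prefix);
    snprintf(val_path, sizeof(val_path), "%s_val.bin", prefix);
    printf("train tokens: %s\n", train_path);
    printf("val tokens:   %s\n", val_path);
    return 0;
}
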
6 changes: 5 additions & 1 deletion train_gpt2.py
@@ -495,7 +495,11 @@ def print0(*args, **kwargs):

 # load the tokens
 # note we're using val by default instead of train split just because it is smaller/faster
-assert os.path.isfile(args.input_bin)
+if not os.path.isfile(args.input_bin):
+    print0(f"ERROR: input .bin file not found: {args.input_bin}")
+    print0("---> HINT: try to re-run the data prepro script. these recently moved to dev/data")
+    print0("---> HINT: for example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
+    exit(1)
 print0(f"loading cached tokens in {args.input_bin}")
 with open(args.input_bin, "rb") as f:
     tokens = np.frombuffer(f.read(), dtype=np.int32)
3 changes: 2 additions & 1 deletion utils.h
@@ -24,7 +24,8 @@ FILE *fopen_check(const char *path, const char *mode, const char *file, int line
fprintf(stderr, " Line: %d\n", line);
fprintf(stderr, " Path: %s\n", path);
fprintf(stderr, " Mode: %s\n", mode);
fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n");
fprintf(stderr, "---> HINT 1: dataset files/code have moved to dev/data recently (May 20, 2024). You may have to mv them from the legacy data/ dir to dev/data/(dataset), or re-run the data preprocessing script. Refer back to the main README\n");
fprintf(stderr, "---> HINT 2: possibly try to re-run `python train_gpt2.py`\n");
exit(EXIT_FAILURE);
}
return fp;
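fopen_check takes the caller's file and line so that a thin wrapper macro can forward __FILE__ and __LINE__, pointing the error at the call site instead of at utils.h. A minimal sketch of that pattern, using demo names rather than the exact macro utils.h defines:

// sketch: checked fopen that reports the caller's location
// (fopenCheckDemo is a stand-in name; see utils.h for the real wrapper)
#include <stdio.h>
#include <stdlib.h>

FILE *fopen_check_demo(const char *path, const char *mode, const char *file, int line) {
    FILE *fp = fopen(path, mode);
    if (fp == NULL) {
        fprintf(stderr, "Failed to open %s (mode %s) at %s:%d\n", path, mode, file, line);
        exit(EXIT_FAILURE);
    }
    return fp;
}
#define fopenCheckDemo(path, mode) fopen_check_demo(path, mode, __FILE__, __LINE__)

int main(void) {
    // if the file is missing, the error names this file and line, not the helper
    FILE *fp = fopenCheckDemo("dev/data/tinyshakespeare/tiny_shakespeare_val.bin", "rb");
    fclose(fp);
    return 0;
}
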
