From 051f3ca53c4e0541a2a81ba09f7c0f96771ad9d6 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 22 May 2024 19:24:20 +0000 Subject: [PATCH] first draft, apparently this works. needs cleanups, and also we are not yet utilizing the full batch dimension. we actually have to load in multiple examples and fully utilize batch --- .gitignore | 2 +- dataloader.h | 170 ++++++++++++++++++++++++++++++++++++++++ dev/data/data_common.py | 61 ++++++++++++++ dev/data/hellaswag.py | 33 ++++++-- train_gpt2.cu | 63 +++++++++++++-- 5 files changed, 317 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 05391b6d1..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb/ +dev/data/fineweb10B/ dev/data/hellaswag/ dev/data/mmlu/ dev/data/tinyshakespeare/ diff --git a/dataloader.h b/dataloader.h index d04fc03ed..72055d8db 100644 --- a/dataloader.h +++ b/dataloader.h @@ -195,3 +195,173 @@ void dataloader_free(DataLoader *loader) { fcloseCheck(loader->tokens_file); globfree(&loader->glob_result); } + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like HellaSwag and MMLU) are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve this style of evals +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - <START_EXAMPLE> delimiter of 2**16-1, i.e. 65,535 + - <EXAMPLE_BYTES>, bytes encoding this example, allowing efficient skip to next + - <EXAMPLE_INDEX>, the index of the example in the dataset + -