diff --git a/.gitignore b/.gitignore index 05391b6d1..4f6c4a0c7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ # data directories dev/data/__pycache__/ -dev/data/fineweb/ +dev/data/fineweb10B/ dev/data/hellaswag/ dev/data/mmlu/ dev/data/tinyshakespeare/ diff --git a/dataloader.h b/dataloader.h index d04fc03ed..72055d8db 100644 --- a/dataloader.h +++ b/dataloader.h @@ -195,3 +195,173 @@ void dataloader_free(DataLoader *loader) { fcloseCheck(loader->tokens_file); globfree(&loader->glob_result); } + +// ---------------------------------------------------------------------------- +// Distributed Eval Loader +// Many evals (like HellaSwag and MMLU) are multiple-choice +// where there are 4 possible continuations and a label for the correct one +// We want to load and serve this style of eval +/* +Copy pasting the section on the eval datafile format, from data_common.py: +- First comes a header with 256 int32s +- The examples follow, each example is a stream of uint16_t: + - <START_EXAMPLE> delimiter of 2**16-1, i.e. 65,535 + - <EXAMPLE_BYTES>, bytes encoding this example, allowing efficient skip to next + - <EXAMPLE_INDEX>, the index of the example in the dataset + -