From 07fcb66fddd06a621fe4e0c81bcfc88015afe87a Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 09:16:26 +0000 Subject: [PATCH 1/5] Setup tokenized pile dataset --- spd/data.py | 9 +++++++-- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 6 +++--- spd/pretrain/configs/gpt2_simple-2L.yaml | 4 ++-- .../configs/owt_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-12L-768.yaml | 12 ++++++------ .../configs/pile_llama_simple_mlp-2L-2048.yaml | 12 ++++++------ .../configs/pile_llama_simple_mlp-2L-768.yaml | 16 ++++++++-------- .../configs/pile_llama_simple_mlp-4L-768.yaml | 16 ++++++++-------- .../configs/ss_llama_simple_mlp-2L-128.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-4L-192.yaml | 4 ++-- spd/pretrain/train.py | 14 +++----------- 11 files changed, 49 insertions(+), 52 deletions(-) diff --git a/spd/data.py b/spd/data.py index 139e773a8..ed2285f94 100644 --- a/spd/data.py +++ b/spd/data.py @@ -223,9 +223,14 @@ def create_data_loader( assert isinstance(sample, Tensor) and sample.ndim == 1, ( f"Expected the dataset to be tokenized. Got type {type(sample)}" ) - assert len(sample) == dataset_config.n_ctx, ( - f"n_ctx ({dataset_config.n_ctx}) does not match the tokenized length ({len(sample)})." + tokenized_len = len(sample) + assert dataset_config.n_ctx <= tokenized_len, ( + f"n_ctx ({dataset_config.n_ctx}) is larger than the tokenized length ({tokenized_len})." 
) + if dataset_config.n_ctx < tokenized_len: + col = dataset_config.column_name + n_ctx = dataset_config.n_ctx + torch_dataset = dataset.map(lambda x: {col: x[col][:n_ctx]}).with_format("torch") else: to_lower = "SimpleStories" in dataset_config.name torch_dataset = tokenize_and_concatenate( diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index 3fed070fc..7b3b0d4e9 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -98,10 +98,10 @@ task_config: task_name: lm max_seq_len: 512 buffer_size: 1000 - dataset_name: monology/pile-uncopyrighted + dataset_name: danbraunai/pile-uncopyrighted column_name: text - train_data_split: train[:10000000] - eval_data_split: train[-100000:] + train_data_split: train + eval_data_split: val shuffle_each_epoch: true is_tokenized: false streaming: false \ No newline at end of file diff --git a/spd/pretrain/configs/gpt2_simple-2L.yaml b/spd/pretrain/configs/gpt2_simple-2L.yaml index 145663be7..5f570faca 100644 --- a/spd/pretrain/configs/gpt2_simple-2L.yaml +++ b/spd/pretrain/configs/gpt2_simple-2L.yaml @@ -27,7 +27,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -37,6 +37,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml index 038187444..969a9c938 100644 --- a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: gpt2 split: train[:10000000] streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text @@ -41,6 +41,6 
@@ val_dataset_config: hf_tokenizer_path: gpt2 split: train[-100000:] streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index 6571f69c7..b0b07a9cd 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -26,21 +26,21 @@ model: flash_attention: false train_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-1000000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index 380360f90..dc8e97a3a 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -35,21 +35,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml 
b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 1bbc636df..8df3d0699 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -32,21 +32,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:-100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index 7b33d348a..ffbe4c143 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -33,21 +33,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml index f3f3b5390..86ddab606 100644 --- 
a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml index b68927c00..3ee32bb1d 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/train.py b/spd/pretrain/train.py index bd5fb9b12..73d0d6725 100644 --- a/spd/pretrain/train.py +++ b/spd/pretrain/train.py @@ -206,15 +206,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: load_dotenv(override=True) config = load_config(config_path_or_obj, config_model=Config) - T = config.train_dataset_config.n_ctx # Training sequence length (positions to train on) - - # Load n_ctx+1 tokens so we can train on n_ctx positions (need extra token for labels) - train_dataset_config = config.train_dataset_config.model_copy( - update={"n_ctx": config.train_dataset_config.n_ctx + 1} - ) - val_dataset_config = config.val_dataset_config.model_copy( - update={"n_ctx": config.val_dataset_config.n_ctx + 1} - ) + T = config.train_dataset_config.n_ctx - 1 # Training sequence length (positions to train on) # set up DDP 
(distributed data parallel). torchrun sets this env variable ddp = int(os.environ.get("RANK", -1)) != -1 @@ -304,7 +296,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: model = cast(nn.Module, torch.compile(model)) # type: ignore[reportArgumentType] train_loader, train_tokenizer = create_data_loader( - dataset_config=train_dataset_config, + dataset_config=config.train_dataset_config, batch_size=B, buffer_size=1000, global_seed=0, @@ -313,7 +305,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: train_iter = iter(train_loader) val_loader, _ = create_data_loader( - dataset_config=val_dataset_config, + dataset_config=config.val_dataset_config, batch_size=B, buffer_size=1000, global_seed=0, From 3c79f399136ce18c4e73a5f7229ff56c65c70202 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 09:22:15 +0000 Subject: [PATCH 2/5] Update spd pile config --- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index 7b3b0d4e9..ac2ebad80 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -93,15 +93,15 @@ pretrained_model_class: spd.pretrain.models.llama_simple_mlp.LlamaSimpleMLP pretrained_model_path: null pretrained_model_name: goodfire/spd/runs/ivdw6l06 pretrained_model_output_attr: idx_0 -tokenizer_name: gpt2 +tokenizer_name: EleutherAI/gpt-neox-20b task_config: task_name: lm max_seq_len: 512 buffer_size: 1000 - dataset_name: danbraunai/pile-uncopyrighted + dataset_name: danbraunai/pile-uncopyrighted-tok column_name: text train_data_split: train eval_data_split: val shuffle_each_epoch: true - is_tokenized: false + is_tokenized: true streaming: false \ No newline at end of file From 53838b3b94b6a166f785aef47a86b889a0e45429 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 
2026 10:15:13 +0000 Subject: [PATCH 3/5] Update configs --- .../lm/pile_llama_simple_mlp-2L.yaml | 2 +- .../pile_llama_simple_mlp-12L-768.yaml | 13 +++--- .../configs/pile_llama_simple_mlp-1L-128.yaml | 46 +++++++++++++++++++ .../pile_llama_simple_mlp-2L-2048.yaml | 12 ++--- .../configs/pile_llama_simple_mlp-2L-768.yaml | 4 +- .../configs/pile_llama_simple_mlp-4L-768.yaml | 5 +- 6 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index ac2ebad80..b88f9b433 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -99,7 +99,7 @@ task_config: max_seq_len: 512 buffer_size: 1000 dataset_name: danbraunai/pile-uncopyrighted-tok - column_name: text + column_name: input_ids train_data_split: train eval_data_split: val shuffle_each_epoch: true diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index b0b07a9cd..a1a12a9cc 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -26,21 +26,22 @@ model: flash_attention: false train_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids + diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml 
b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml new file mode 100644 index 000000000..326f1f7fb --- /dev/null +++ b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml @@ -0,0 +1,46 @@ +wandb_project: spd +dtype: bfloat16 +batch_size: 1024 +num_iterations: 100_000 +warmup_iters: 600 +learning_rate: 3e-4 +learning_rate_decay_frac: 0.1 +weight_decay: 0.1 +grad_clip: 1.0 +val_loss_every: 1000 +val_max_steps: 20 +sample_every: 1000 +intermediate_checkpoints: false + +model: + model_type: LlamaSimpleMLP + block_size: 512 + vocab_size: 4019 + n_layer: 2 + n_head: 4 + n_embd: 128 + n_intermediate: 512 # 128 * 4 + rotary_dim: 32 # 128 // 4 + n_ctx: 512 + n_key_value_heads: 2 + flash_attention: false + +train_dataset_config: + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true + hf_tokenizer_path: EleutherAI/gpt-neox-20b + split: train + streaming: false + n_ctx: 513 + seed: 0 + column_name: input_ids + +val_dataset_config: + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true + hf_tokenizer_path: EleutherAI/gpt-neox-20b + split: val + streaming: false + n_ctx: 513 + seed: 0 + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index dc8e97a3a..d59d126b1 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -35,21 +35,21 @@ model: vocab_size: 50277 train_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false n_ctx: 513 seed: 0 - column_name: 
text + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 8df3d0699..3c9c14ef5 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -39,7 +39,7 @@ train_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: name: danbraunai/pile-uncopyrighted-tok @@ -49,4 +49,4 @@ val_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index ffbe4c143..315d1309a 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -40,7 +40,7 @@ train_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: name: danbraunai/pile-uncopyrighted-tok @@ -50,4 +50,5 @@ val_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids + From c4eddcddd18837ee4608f6275cdc9c7e6dcbd7ef Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 13:56:07 +0000 Subject: [PATCH 4/5] Update docs for dataset n_ctx --- spd/data.py | 15 ++++++++------- spd/pretrain/CLAUDE.md | 7 +++++++ spd/pretrain/configs/gpt2_simple-2L.yaml | 4 ++-- .../configs/owt_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-1L-128.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-2L-2048.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-2L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-4L-768.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-2L-128.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-4L-192.yaml | 4 ++-- 11 files changed, 33 insertions(+), 25 deletions(-) diff --git 
a/spd/data.py b/spd/data.py index ed2285f94..f55703811 100644 --- a/spd/data.py +++ b/spd/data.py @@ -16,14 +16,15 @@ class DatasetConfig(BaseConfig): - name: str = "lennart-finke/SimpleStories" - is_tokenized: bool = True - hf_tokenizer_path: str | None = None - streaming: bool = False - split: str = "train" - n_ctx: int = 1024 + name: str + is_tokenized: bool + hf_tokenizer_path: str | None + streaming: bool + split: str + n_ctx: int + """Must be model n_ctx + 1 to provide room for next-token label indexing.""" seed: int | None = None - column_name: str = "input_ids" + column_name: str """The name of the column in the dataset that contains the data (tokenized or non-tokenized). Typically 'input_ids' for datasets stored with e2e_sae/scripts/upload_hf_dataset.py, or "tokens" for datasets tokenized in TransformerLens (e.g. NeelNanda/pile-10k).""" diff --git a/spd/pretrain/CLAUDE.md b/spd/pretrain/CLAUDE.md index 568bc4ad0..92a02c60a 100644 --- a/spd/pretrain/CLAUDE.md +++ b/spd/pretrain/CLAUDE.md @@ -38,6 +38,13 @@ spd-pretrain --config_path ... --n_gpus 4 - **SimpleStories**: `SimpleStories/test-SimpleStories-gpt2-1.25M` (vocab size: 4019) - **Pile/OpenWebText**: `gpt2` (vocab size: 50257) +## Dataset n_ctx vs Model n_ctx + +The dataset `n_ctx` must be **model n_ctx + 1**. During training, sequences are split into +input `[:, :-1]` and target `[:, 1:]` for next-token prediction, so the extra token provides +room for label indexing. For example, if the model has `n_ctx: 512`, the dataset should have +`n_ctx: 513`. 
+ ## Key Files - `train.py` - Main training loop with DDP support diff --git a/spd/pretrain/configs/gpt2_simple-2L.yaml b/spd/pretrain/configs/gpt2_simple-2L.yaml index 5f570faca..8bc45aafc 100644 --- a/spd/pretrain/configs/gpt2_simple-2L.yaml +++ b/spd/pretrain/configs/gpt2_simple-2L.yaml @@ -27,7 +27,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model block_size + 1 for next-token label indexing seed: 0 column_name: story @@ -37,6 +37,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model block_size + 1 for next-token label indexing seed: 0 column_name: story diff --git a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml index 969a9c938..b969660ee 100644 --- a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: gpt2 split: train[:10000000] streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: text @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: gpt2 split: train[-100000:] streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index a1a12a9cc..5336408ef 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -41,7 +41,7 @@ val_dataset_config: 
hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml index 326f1f7fb..d6d05db3c 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index d59d126b1..2b22b7c4c 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -40,7 +40,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -50,6 +50,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 3c9c14ef5..488272302 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -37,7 +37,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false 
- n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -47,6 +47,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index 315d1309a..6aa5badc7 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -38,7 +38,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -48,7 +48,7 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml index 86ddab606..71be44f65 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml index 3ee32bb1d..c7fa2b053 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml +++ 
b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story From 7334df5c32b4f4292abbc8e2ed8d5568a5a5fd61 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 14:09:38 +0000 Subject: [PATCH 5/5] Add temp streaming=True for pile config --- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index b88f9b433..6341a1864 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -96,7 +96,7 @@ pretrained_model_output_attr: idx_0 tokenizer_name: EleutherAI/gpt-neox-20b task_config: task_name: lm - max_seq_len: 512 + max_seq_len: 512 # Temporary. Later we will do n_ctx=513 for the dataset and streaming=false buffer_size: 1000 dataset_name: danbraunai/pile-uncopyrighted-tok column_name: input_ids @@ -104,4 +104,4 @@ task_config: eval_data_split: val shuffle_each_epoch: true is_tokenized: true - streaming: false \ No newline at end of file + streaming: true # Temporary. Later we will do n_ctx=513 for the dataset and streaming=false \ No newline at end of file