From 07fcb66fddd06a621fe4e0c81bcfc88015afe87a Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 09:16:26 +0000 Subject: [PATCH 1/5] Setup tokenized pile dataset --- spd/data.py | 9 +++++++-- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 6 +++--- spd/pretrain/configs/gpt2_simple-2L.yaml | 4 ++-- .../configs/owt_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-12L-768.yaml | 12 ++++++------ .../configs/pile_llama_simple_mlp-2L-2048.yaml | 12 ++++++------ .../configs/pile_llama_simple_mlp-2L-768.yaml | 16 ++++++++-------- .../configs/pile_llama_simple_mlp-4L-768.yaml | 16 ++++++++-------- .../configs/ss_llama_simple_mlp-2L-128.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-4L-192.yaml | 4 ++-- spd/pretrain/train.py | 14 +++----------- 11 files changed, 49 insertions(+), 52 deletions(-) diff --git a/spd/data.py b/spd/data.py index 139e773a8..ed2285f94 100644 --- a/spd/data.py +++ b/spd/data.py @@ -223,9 +223,14 @@ def create_data_loader( assert isinstance(sample, Tensor) and sample.ndim == 1, ( f"Expected the dataset to be tokenized. Got type {type(sample)}" ) - assert len(sample) == dataset_config.n_ctx, ( - f"n_ctx ({dataset_config.n_ctx}) does not match the tokenized length ({len(sample)})." + tokenized_len = len(sample) + assert dataset_config.n_ctx <= tokenized_len, ( + f"n_ctx ({dataset_config.n_ctx}) is larger than the tokenized length ({tokenized_len})." 
) + if dataset_config.n_ctx < tokenized_len: + col = dataset_config.column_name + n_ctx = dataset_config.n_ctx + torch_dataset = dataset.map(lambda x: {col: x[col][:n_ctx]}).with_format("torch") else: to_lower = "SimpleStories" in dataset_config.name torch_dataset = tokenize_and_concatenate( diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index 3fed070fc..7b3b0d4e9 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -98,10 +98,10 @@ task_config: task_name: lm max_seq_len: 512 buffer_size: 1000 - dataset_name: monology/pile-uncopyrighted + dataset_name: danbraunai/pile-uncopyrighted column_name: text - train_data_split: train[:10000000] - eval_data_split: train[-100000:] + train_data_split: train + eval_data_split: val shuffle_each_epoch: true is_tokenized: false streaming: false \ No newline at end of file diff --git a/spd/pretrain/configs/gpt2_simple-2L.yaml b/spd/pretrain/configs/gpt2_simple-2L.yaml index 145663be7..5f570faca 100644 --- a/spd/pretrain/configs/gpt2_simple-2L.yaml +++ b/spd/pretrain/configs/gpt2_simple-2L.yaml @@ -27,7 +27,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -37,6 +37,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml index 038187444..969a9c938 100644 --- a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: gpt2 split: train[:10000000] streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text @@ -41,6 +41,6 
@@ val_dataset_config: hf_tokenizer_path: gpt2 split: train[-100000:] streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index 6571f69c7..b0b07a9cd 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -26,21 +26,21 @@ model: flash_attention: false train_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-1000000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index 380360f90..dc8e97a3a 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -35,21 +35,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted + name: danbraunai/pile-uncopyrighted is_tokenized: false hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml 
b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 1bbc636df..8df3d0699 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -32,21 +32,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:-100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index 7b33d348a..ffbe4c143 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -33,21 +33,21 @@ model: vocab_size: 50277 train_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[:100000] # Dataset has 177M examples + split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text val_dataset_config: - name: monology/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b - split: train[-100000:] + split: val streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: text diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml index f3f3b5390..86ddab606 100644 --- 
a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml index b68927c00..3ee32bb1d 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 512 + n_ctx: 513 seed: 0 column_name: story diff --git a/spd/pretrain/train.py b/spd/pretrain/train.py index bd5fb9b12..73d0d6725 100644 --- a/spd/pretrain/train.py +++ b/spd/pretrain/train.py @@ -206,15 +206,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: load_dotenv(override=True) config = load_config(config_path_or_obj, config_model=Config) - T = config.train_dataset_config.n_ctx # Training sequence length (positions to train on) - - # Load n_ctx+1 tokens so we can train on n_ctx positions (need extra token for labels) - train_dataset_config = config.train_dataset_config.model_copy( - update={"n_ctx": config.train_dataset_config.n_ctx + 1} - ) - val_dataset_config = config.val_dataset_config.model_copy( - update={"n_ctx": config.val_dataset_config.n_ctx + 1} - ) + T = config.train_dataset_config.n_ctx - 1 # Training sequence length (positions to train on) # set up DDP 
(distributed data parallel). torchrun sets this env variable ddp = int(os.environ.get("RANK", -1)) != -1 @@ -304,7 +296,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: model = cast(nn.Module, torch.compile(model)) # type: ignore[reportArgumentType] train_loader, train_tokenizer = create_data_loader( - dataset_config=train_dataset_config, + dataset_config=config.train_dataset_config, batch_size=B, buffer_size=1000, global_seed=0, @@ -313,7 +305,7 @@ def main(config_path_or_obj: Path | str | Config | None = None) -> None: train_iter = iter(train_loader) val_loader, _ = create_data_loader( - dataset_config=val_dataset_config, + dataset_config=config.val_dataset_config, batch_size=B, buffer_size=1000, global_seed=0, From 3c79f399136ce18c4e73a5f7229ff56c65c70202 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 09:22:15 +0000 Subject: [PATCH 2/5] Update spd pile config --- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index 7b3b0d4e9..ac2ebad80 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -93,15 +93,15 @@ pretrained_model_class: spd.pretrain.models.llama_simple_mlp.LlamaSimpleMLP pretrained_model_path: null pretrained_model_name: goodfire/spd/runs/ivdw6l06 pretrained_model_output_attr: idx_0 -tokenizer_name: gpt2 +tokenizer_name: EleutherAI/gpt-neox-20b task_config: task_name: lm max_seq_len: 512 buffer_size: 1000 - dataset_name: danbraunai/pile-uncopyrighted + dataset_name: danbraunai/pile-uncopyrighted-tok column_name: text train_data_split: train eval_data_split: val shuffle_each_epoch: true - is_tokenized: false + is_tokenized: true streaming: false \ No newline at end of file From 53838b3b94b6a166f785aef47a86b889a0e45429 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 
2026 10:15:13 +0000 Subject: [PATCH 3/5] Update configs --- .../lm/pile_llama_simple_mlp-2L.yaml | 2 +- .../pile_llama_simple_mlp-12L-768.yaml | 13 +++--- .../configs/pile_llama_simple_mlp-1L-128.yaml | 46 +++++++++++++++++++ .../pile_llama_simple_mlp-2L-2048.yaml | 12 ++--- .../configs/pile_llama_simple_mlp-2L-768.yaml | 4 +- .../configs/pile_llama_simple_mlp-4L-768.yaml | 5 +- 6 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index ac2ebad80..b88f9b433 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -99,7 +99,7 @@ task_config: max_seq_len: 512 buffer_size: 1000 dataset_name: danbraunai/pile-uncopyrighted-tok - column_name: text + column_name: input_ids train_data_split: train eval_data_split: val shuffle_each_epoch: true diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index b0b07a9cd..a1a12a9cc 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -26,21 +26,22 @@ model: flash_attention: false train_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids + diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml 
b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml new file mode 100644 index 000000000..326f1f7fb --- /dev/null +++ b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml @@ -0,0 +1,46 @@ +wandb_project: spd +dtype: bfloat16 +batch_size: 1024 +num_iterations: 100_000 +warmup_iters: 600 +learning_rate: 3e-4 +learning_rate_decay_frac: 0.1 +weight_decay: 0.1 +grad_clip: 1.0 +val_loss_every: 1000 +val_max_steps: 20 +sample_every: 1000 +intermediate_checkpoints: false + +model: + model_type: LlamaSimpleMLP + block_size: 512 + vocab_size: 4019 + n_layer: 2 + n_head: 4 + n_embd: 128 + n_intermediate: 512 # 128 * 4 + rotary_dim: 32 # 128 // 4 + n_ctx: 512 + n_key_value_heads: 2 + flash_attention: false + +train_dataset_config: + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true + hf_tokenizer_path: EleutherAI/gpt-neox-20b + split: train + streaming: false + n_ctx: 513 + seed: 0 + column_name: input_ids + +val_dataset_config: + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true + hf_tokenizer_path: EleutherAI/gpt-neox-20b + split: val + streaming: false + n_ctx: 513 + seed: 0 + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index dc8e97a3a..d59d126b1 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -35,21 +35,21 @@ model: vocab_size: 50277 train_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: - name: danbraunai/pile-uncopyrighted - is_tokenized: false + name: danbraunai/pile-uncopyrighted-tok + is_tokenized: true hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false n_ctx: 513 seed: 0 - column_name: 
text + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 8df3d0699..3c9c14ef5 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -39,7 +39,7 @@ train_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: name: danbraunai/pile-uncopyrighted-tok @@ -49,4 +49,4 @@ val_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index ffbe4c143..315d1309a 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -40,7 +40,7 @@ train_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids val_dataset_config: name: danbraunai/pile-uncopyrighted-tok @@ -50,4 +50,5 @@ val_dataset_config: streaming: false n_ctx: 513 seed: 0 - column_name: text + column_name: input_ids + From c4eddcddd18837ee4608f6275cdc9c7e6dcbd7ef Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 13:56:07 +0000 Subject: [PATCH 4/5] Update docs for dataset n_ctx --- spd/data.py | 15 ++++++++------- spd/pretrain/CLAUDE.md | 7 +++++++ spd/pretrain/configs/gpt2_simple-2L.yaml | 4 ++-- .../configs/owt_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-12L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-1L-128.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-2L-2048.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-2L-768.yaml | 4 ++-- .../configs/pile_llama_simple_mlp-4L-768.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-2L-128.yaml | 4 ++-- .../configs/ss_llama_simple_mlp-4L-192.yaml | 4 ++-- 11 files changed, 33 insertions(+), 25 deletions(-) diff --git 
a/spd/data.py b/spd/data.py index ed2285f94..f55703811 100644 --- a/spd/data.py +++ b/spd/data.py @@ -16,14 +16,15 @@ class DatasetConfig(BaseConfig): - name: str = "lennart-finke/SimpleStories" - is_tokenized: bool = True - hf_tokenizer_path: str | None = None - streaming: bool = False - split: str = "train" - n_ctx: int = 1024 + name: str + is_tokenized: bool + hf_tokenizer_path: str | None + streaming: bool + split: str + n_ctx: int + """Must be model n_ctx + 1 to provide room for next-token label indexing.""" seed: int | None = None - column_name: str = "input_ids" + column_name: str """The name of the column in the dataset that contains the data (tokenized or non-tokenized). Typically 'input_ids' for datasets stored with e2e_sae/scripts/upload_hf_dataset.py, or "tokens" for datasets tokenized in TransformerLens (e.g. NeelNanda/pile-10k).""" diff --git a/spd/pretrain/CLAUDE.md b/spd/pretrain/CLAUDE.md index 568bc4ad0..92a02c60a 100644 --- a/spd/pretrain/CLAUDE.md +++ b/spd/pretrain/CLAUDE.md @@ -38,6 +38,13 @@ spd-pretrain --config_path ... --n_gpus 4 - **SimpleStories**: `SimpleStories/test-SimpleStories-gpt2-1.25M` (vocab size: 4019) - **Pile/OpenWebText**: `gpt2` (vocab size: 50257) +## Dataset n_ctx vs Model n_ctx + +The dataset `n_ctx` must be **model n_ctx + 1**. During training, sequences are split into +input `[:, :-1]` and target `[:, 1:]` for next-token prediction, so the extra token provides +room for label indexing. For example, if the model has `n_ctx: 512`, the dataset should have +`n_ctx: 513`. 
+ ## Key Files - `train.py` - Main training loop with DDP support diff --git a/spd/pretrain/configs/gpt2_simple-2L.yaml b/spd/pretrain/configs/gpt2_simple-2L.yaml index 5f570faca..8bc45aafc 100644 --- a/spd/pretrain/configs/gpt2_simple-2L.yaml +++ b/spd/pretrain/configs/gpt2_simple-2L.yaml @@ -27,7 +27,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model block_size + 1 for next-token label indexing seed: 0 column_name: story @@ -37,6 +37,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model block_size + 1 for next-token label indexing seed: 0 column_name: story diff --git a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml index 969a9c938..b969660ee 100644 --- a/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/owt_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: gpt2 split: train[:10000000] streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: text @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: gpt2 split: train[-100000:] streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: text diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml index a1a12a9cc..5336408ef 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-12L-768.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -41,7 +41,7 @@ val_dataset_config: 
hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml index 326f1f7fb..d6d05db3c 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-1L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml index d59d126b1..2b22b7c4c 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-2048.yaml @@ -40,7 +40,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -50,6 +50,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml index 3c9c14ef5..488272302 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-2L-768.yaml @@ -37,7 +37,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false 
- n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -47,6 +47,6 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml index 315d1309a..6aa5badc7 100644 --- a/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml +++ b/spd/pretrain/configs/pile_llama_simple_mlp-4L-768.yaml @@ -38,7 +38,7 @@ train_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids @@ -48,7 +48,7 @@ val_dataset_config: hf_tokenizer_path: EleutherAI/gpt-neox-20b split: val streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: input_ids diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml index 86ddab606..71be44f65 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml +++ b/spd/pretrain/configs/ss_llama_simple_mlp-2L-128.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story diff --git a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml index 3ee32bb1d..c7fa2b053 100644 --- a/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml +++ 
b/spd/pretrain/configs/ss_llama_simple_mlp-4L-192.yaml @@ -31,7 +31,7 @@ train_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: train streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story @@ -41,6 +41,6 @@ val_dataset_config: hf_tokenizer_path: SimpleStories/test-SimpleStories-gpt2-1.25M split: test streaming: false - n_ctx: 513 + n_ctx: 513 # model n_ctx + 1 for next-token label indexing seed: 0 column_name: story From 7334df5c32b4f4292abbc8e2ed8d5568a5a5fd61 Mon Sep 17 00:00:00 2001 From: Dan Braun Date: Mon, 9 Feb 2026 14:09:38 +0000 Subject: [PATCH 5/5] Add temp streaming=True for pile config --- spd/experiments/lm/pile_llama_simple_mlp-2L.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml index b88f9b433..6341a1864 100644 --- a/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml +++ b/spd/experiments/lm/pile_llama_simple_mlp-2L.yaml @@ -96,7 +96,7 @@ pretrained_model_output_attr: idx_0 tokenizer_name: EleutherAI/gpt-neox-20b task_config: task_name: lm - max_seq_len: 512 + max_seq_len: 512 # Temporary. Later we will do n_ctx=513 for the dataset and streaming=false buffer_size: 1000 dataset_name: danbraunai/pile-uncopyrighted-tok column_name: input_ids @@ -104,4 +104,4 @@ task_config: eval_data_split: val shuffle_each_epoch: true is_tokenized: true - streaming: false \ No newline at end of file + streaming: true # Temporary. Later we will do n_ctx=513 for the dataset and streaming=false \ No newline at end of file