miolini · MihailAT · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/program.md b/program.md
@@ -99,7 +99,7 @@ LOOP FOREVER:
 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context)
 5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log`
 6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up.
-7. Record the results in the tsv
+7. Always record the results in the tsv. Never delete results.tsv!
 8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit
 9. If val_bpb is equal or worse, you git reset back to where you started
 

diff --git a/results.tsv b/results.tsv
@@ -0,0 +1,3 @@
+commit	val_bpb	memory_gb	status	description
+537c6e6	1.834839	0.0	keep	baseline
+35e3dec	1.731247	0.0	keep	reduce WARMDOWN_RATIO to 0.25
diff --git a/run.log b/run.log
@@ -0,0 +1,27 @@
+Environment verified: macOS detected with Metal (MPS) hardware acceleration available.
+
+Environment verified: macOS detected with Metal (MPS) hardware acceleration available.
+
+Vocab size: 8,192
+Model config: {'sequence_len': 2048, 'vocab_size': 8192, 'n_layer': 6, 'n_head': 3, 'n_kv_head': 3, 'n_embd': 384, 'window_pattern': 'L'}
+Parameter counts:
+  wte                     : 3,145,728
+  value_embeds            : 9,437,184
+  lm_head                 : 3,145,728
+  transformer_matrices    : 10,617,120
+  scalars                 : 12
+  total                   : 26,345,772
+Estimated FLOPs per token: 1.392002e+08
+Scaling AdamW LRs by 1/sqrt(384/768) = 1.414214
+Time budget: 300s
+Gradient accumulation steps: 2
+step 00000 (0.0%) | loss: 9.011027 | lrm: 1.00 | dt: 28217ms | tok/sec: 2,322 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00001 (0.0%) | loss: 9.115025 | lrm: 1.00 | dt: 23368ms | tok/sec: 2,804 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00002 (0.0%) | loss: 8.944727 | lrm: 1.00 | dt: 40823ms | tok/sec: 1,605 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00003 (0.0%) | loss: 8.714776 | lrm: 1.00 | dt: 73912ms | tok/sec: 886 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00004 (0.0%) | loss: 8.392169 | lrm: 1.00 | dt: 41292ms | tok/sec: 1,587 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00005 (0.0%) | loss: 8.095791 | lrm: 1.00 | dt: 96242ms | tok/sec: 680 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00006 (0.0%) | loss: 7.847162 | lrm: 1.00 | dt: 81709ms | tok/sec: 802 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00007 (0.0%) | loss: 7.631296 | lrm: 1.00 | dt: 77427ms | tok/sec: 846 | mfu: 0.0% | epoch: 1 | remaining: 300s    step 00008 (0.0%) | loss: 7.439391 | lrm: 1.00 | dt: 88897ms | tok/sec: 737 | mfu: 0.0% | epoch: 1 | remaining: 300s    Traceback (most recent call last):
+  File "/Users/misha/Python GENERAL/autoresearchMT/train.py", line 622, in <module>
+    loss.backward()
+  File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/_tensor.py", line 626, in backward
+    torch.autograd.backward(
+  File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 347, in backward
+    _engine_run_backward(
+  File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/autograd/graph.py", line 823, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+RuntimeError: MPS backend out of memory (MPS allocated: 16.30 GB, other allocations: 1.76 GB, max allowed: 18.13 GB). Tried to allocate 1024.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

diff --git a/train.py b/train.py
@@ -493,11 +493,11 @@ def step(self):
 WEIGHT_DECAY = 0.2      # cautious weight decay for Muon
 ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2
 WARMUP_RATIO = 0.0      # fraction of time budget for LR warmup
-WARMDOWN_RATIO = 0.5    # fraction of time budget for LR warmdown
+WARMDOWN_RATIO = 0.25   # fraction of time budget for LR warmdown (was 0.5)
 FINAL_LR_FRAC = 0.0     # final LR as fraction of initial
 
 # Model size
-DEPTH = 4               # number of transformer layers
+DEPTH = 6               # number of transformer layers (increased from 4)
 DEVICE_BATCH_SIZE = 16  # per-device batch size (reduce if OOM)
 
 # ---------------------------------------------------------------------------