From 35e3deca62356acaf79293f236dd04797c3e404c Mon Sep 17 00:00:00 2001 From: Mihail Date: Wed, 15 Apr 2026 12:27:51 +0100 Subject: [PATCH 1/3] Experiment: reduce WARMDOWN_RATIO from 0.5 to 0.25 Spend 75% of time at full LR instead of 50%. More training at peak LR should help the model learn faster. --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 79d53d9fa4..236db6b31a 100644 --- a/train.py +++ b/train.py @@ -493,7 +493,7 @@ def step(self): WEIGHT_DECAY = 0.2 # cautious weight decay for Muon ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2 WARMUP_RATIO = 0.0 # fraction of time budget for LR warmup -WARMDOWN_RATIO = 0.5 # fraction of time budget for LR warmdown +WARMDOWN_RATIO = 0.25 # fraction of time budget for LR warmdown (was 0.5) FINAL_LR_FRAC = 0.0 # final LR as fraction of initial # Model size From 60bc730164b6b3708c54853a5a6c095f1ee747ce Mon Sep 17 00:00:00 2001 From: Mihail Date: Wed, 15 Apr 2026 13:27:40 +0100 Subject: [PATCH 2/3] Experiment: increase DEPTH from 4 to 6 More layers for better representation capacity while keeping same width. 
--- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 236db6b31a..ac8c81acca 100644 --- a/train.py +++ b/train.py @@ -497,7 +497,7 @@ def step(self): FINAL_LR_FRAC = 0.0 # final LR as fraction of initial # Model size -DEPTH = 4 # number of transformer layers +DEPTH = 6 # number of transformer layers (increased from 4) DEVICE_BATCH_SIZE = 16 # per-device batch size (reduce if OOM) # --------------------------------------------------------------------------- From 05c4e903796b62212d071dbfd9c4094197ac082d Mon Sep 17 00:00:00 2001 From: Mihail Date: Wed, 15 Apr 2026 13:40:04 +0100 Subject: [PATCH 3/3] protect --- program.md | 2 +- results.tsv | 3 +++ run.log | 27 +++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 results.tsv create mode 100644 run.log diff --git a/program.md b/program.md index 28d37cfc99..dade576839 100644 --- a/program.md +++ b/program.md @@ -99,7 +99,7 @@ LOOP FOREVER: 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context) 5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log` 6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up. -7. Record the results in the tsv +7. Always record the results in the tsv. Never delete results.tsv! 8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit 9. 
If val_bpb is equal or worse, you git reset back to where you started diff --git a/results.tsv b/results.tsv new file mode 100644 index 0000000000..41078f43a5 --- /dev/null +++ b/results.tsv @@ -0,0 +1,3 @@ +commit val_bpb memory_gb status description +537c6e6 1.834839 0.0 keep baseline +35e3dec 1.731247 0.0 keep reduce WARMDOWN_RATIO to 0.25 diff --git a/run.log b/run.log new file mode 100644 index 0000000000..0a1910ce3d --- /dev/null +++ b/run.log @@ -0,0 +1,27 @@ +Environment verified: macOS detected with Metal (MPS) hardware acceleration available. + +Environment verified: macOS detected with Metal (MPS) hardware acceleration available. + +Vocab size: 8,192 +Model config: {'sequence_len': 2048, 'vocab_size': 8192, 'n_layer': 6, 'n_head': 3, 'n_kv_head': 3, 'n_embd': 384, 'window_pattern': 'L'} +Parameter counts: + wte : 3,145,728 + value_embeds : 9,437,184 + lm_head : 3,145,728 + transformer_matrices : 10,617,120 + scalars : 12 + total : 26,345,772 +Estimated FLOPs per token: 1.392002e+08 +Scaling AdamW LRs by 1/sqrt(384/768) = 1.414214 +Time budget: 300s +Gradient accumulation steps: 2 + step 00000 (0.0%) | loss: 9.011027 | lrm: 1.00 | dt: 28217ms | tok/sec: 2,322 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00001 (0.0%) | loss: 9.115025 | lrm: 1.00 | dt: 23368ms | tok/sec: 2,804 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00002 (0.0%) | loss: 8.944727 | lrm: 1.00 | dt: 40823ms | tok/sec: 1,605 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00003 (0.0%) | loss: 8.714776 | lrm: 1.00 | dt: 73912ms | tok/sec: 886 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00004 (0.0%) | loss: 8.392169 | lrm: 1.00 | dt: 41292ms | tok/sec: 1,587 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00005 (0.0%) | loss: 8.095791 | lrm: 1.00 | dt: 96242ms | tok/sec: 680 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00006 (0.0%) | loss: 7.847162 | lrm: 1.00 | dt: 81709ms | tok/sec: 802 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00007 (0.0%) | loss: 7.631296 | lrm: 1.00 | dt: 77427ms | 
tok/sec: 846 | mfu: 0.0% | epoch: 1 | remaining: 300s step 00008 (0.0%) | loss: 7.439391 | lrm: 1.00 | dt: 88897ms | tok/sec: 737 | mfu: 0.0% | epoch: 1 | remaining: 300s Traceback (most recent call last): + File "/Users/misha/Python GENERAL/autoresearchMT/train.py", line 622, in + loss.backward() + File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/_tensor.py", line 626, in backward + torch.autograd.backward( + File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 347, in backward + _engine_run_backward( + File "/Users/misha/Python GENERAL/autoresearchMT/.venv/lib/python3.10/site-packages/torch/autograd/graph.py", line 823, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +RuntimeError: MPS backend out of memory (MPS allocated: 16.30 GB, other allocations: 1.76 GB, max allowed: 18.13 GB). Tried to allocate 1024.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).