From 945e218c4470c8b626bca81be99b4e7b357b1b34 Mon Sep 17 00:00:00 2001
From: Joel Lidin <joellidin@gmail.com>
Date: Sun, 18 Jan 2026 03:57:49 +0400
Subject: [PATCH 1/4] (neurons) Skip batches with all masked labels

Add guards in evaluate_model and inner_steps to prevent NaN loss when
all labels in a batch are masked (-100). This occurs when batches
contain only padding or special tokens.

- Check valid_labels count before forward pass
- Log warning and skip batch if valid_labels == 0
- Clean up tensors before continuing to next batch
- Prevent cross_entropy from receiving empty loss target
---
 neurons/trainer.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
diff --git a/neurons/trainer.py b/neurons/trainer.py
index 0e2d27a3..089487cc 100644
--- a/neurons/trainer.py
+++ b/neurons/trainer.py
@@ -456,6 +456,18 @@ async def evaluate_model(
                     labels == self.tokenizer.pad_token_id, -100, labels
                 )
 
+                # Skip batch if all labels are masked (would cause NaN in cross_entropy)
+                valid_labels = (labels != -100).sum().item()
+                if valid_labels == 0:
+                    tplr.log_with_context(
+                        level="warning",
+                        message=f"Batch {i} has all labels masked (-100), skipping to avoid NaN",
+                        sync_window=self.sync_window,
+                        current_window=self.current_window,
+                    )
+                    del input_ids, labels
+                    continue
+
                 with autocast(device_type=device.type, dtype=torch.bfloat16):
                     logits = model(input_ids)
 
@@ -839,6 +851,15 @@ async def inner_steps(
                         labels == self.tokenizer.pad_token_id, -100, labels
                     )
 
+                    # Skip batch if all labels are masked (would cause NaN in cross_entropy)
+                    valid_labels = (labels != -100).sum().item()
+                    if valid_labels == 0:
+                        tplr.logger.warning(
+                            f"Batch {batch_count} has all labels masked (-100), skipping to avoid NaN"
+                        )
+                        del input_ids, labels
+                        continue
+
                 # ------------------------------------------------------------------ #
                 # 3. Forward + backward
                 # ------------------------------------------------------------------ #

From 84ecdef15b5170b06662e9ef2505542def38ba28 Mon Sep 17 00:00:00 2001
From: Joel Lidin <joellidin@gmail.com>
Date: Sun, 18 Jan 2026 04:02:28 +0400
Subject: [PATCH 2/4] (neurons) Switch anneal mode to shard 5

Update miner and validator to use anneal shard 5 instead of shard 4.
Update documentation to reflect the new shard number in rclone migration
examples.

- Change current_shard from 4 to 5 in miner.py
- Change current_shard from 4 to 5 in validator.py
- Update docs with anneal_000005.npy examples
---
 docs/shared_sharded_dataset.md | 6 +++---
 neurons/miner.py               | 4 ++--
 neurons/validator.py           | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/shared_sharded_dataset.md b/docs/shared_sharded_dataset.md
index ddb5481f..d2fe70fa 100644
--- a/docs/shared_sharded_dataset.md
+++ b/docs/shared_sharded_dataset.md
@@ -148,9 +148,9 @@ rclone copy r2-source:mixed-dataset-migration/anneal/ r2-dest:<your-bucket-name>
 ##### Copy specific shards (Partial Migration for Testing)
 If you want to test with the current anneal shard:
 ```bash
-# Copy anneal shard 4 and its sample IDs
-rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000004.npy r2-dest:<your-bucket-name>/anneal/ --progress
-rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000004.npy r2-dest:<your-bucket-name>/anneal/ --progress
+# Copy anneal shard 5 and its sample IDs
+rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000005.npy r2-dest:<your-bucket-name>/anneal/ --progress
+rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000005.npy r2-dest:<your-bucket-name>/anneal/ --progress
 ```
 
 After migration, update your environment variables to point to your bucket:
diff --git a/neurons/miner.py b/neurons/miner.py
index e1d5e1f6..4ebc9077 100644
--- a/neurons/miner.py
+++ b/neurons/miner.py
@@ -434,9 +434,9 @@ async def run(self):
             self.outer_steps_per_shard,
             self.shard_reset_outer_step,
         )
-        # In anneal mode, always use shard 4
+        # In anneal mode, always use shard 5
         if self.dataset_manager.anneal_mode:
-            current_shard = 4
+            current_shard = 5
             current_shard_epoch = 0
         tplr.logger.info(
             f"Starting with global_step={self.global_step} (actual outer steps)"
diff --git a/neurons/validator.py b/neurons/validator.py
index 513c393c..8a407fbc 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -1263,9 +1263,9 @@ async def run(self):
             self.outer_steps_per_shard,
             self.shard_reset_outer_step,
         )
-        # In anneal mode, always use shard 4
+        # In anneal mode, always use shard 5
         if self.dataset_manager.anneal_mode:
-            current_shard = 4
+            current_shard = 5
             shard_epoch = 0
 
         # Initialize datasets (only rank 0 downloads, handled internally by dataset_manager)

From e66da24fa100a3e24a9b3b6ac1fbb4296d2f1a8f Mon Sep 17 00:00:00 2001
From: Joel Lidin <joellidin@gmail.com>
Date: Sun, 18 Jan 2026 04:02:59 +0400
Subject: [PATCH 3/4] Bump run version

---
 src/tplr/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tplr/__init__.py b/src/tplr/__init__.py
index 34924280..265222a5 100644
--- a/src/tplr/__init__.py
+++ b/src/tplr/__init__.py
@@ -20,7 +20,7 @@
 # mypy: ignore-errors
 # type: ignore
 
-__version__ = "2.1.26"
+__version__ = "2.1.27"
 
 # Import package.
 from .chain import *

From ea3ec3dbddcbf89abcc3bcc00a1bc8955171f22d Mon Sep 17 00:00:00 2001
From: Joel Lidin <joellidin@gmail.com>
Date: Sun, 18 Jan 2026 17:55:06 +0400
Subject: [PATCH 4/4] (hparams) Update anneal scheduler

Change decay_outer_steps from 120 to 150 to mitigate for now always
gathering the full 20 peers.
---
 hparams/hparams.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hparams/hparams.json b/hparams/hparams.json
index 8bf3cbba..bb7023e1 100644
--- a/hparams/hparams.json
+++ b/hparams/hparams.json
@@ -4,7 +4,7 @@
       "enabled": true,
       "start_global_step": 6100,
       "warmup_inner_steps": 100,
-      "decay_outer_steps": 120,
+      "decay_outer_steps": 150,
       "peak_lr_factor": 0.25,
       "eta_min_factor": 0.0,
       "file_prefix": "anneal"