From 945e218c4470c8b626bca81be99b4e7b357b1b34 Mon Sep 17 00:00:00 2001 From: Joel Lidin Date: Sun, 18 Jan 2026 03:57:49 +0400 Subject: [PATCH 1/4] (neurons) Skip batches with all masked labels Add guards in evaluate_model and inner_steps to prevent NaN loss when all labels in a batch are masked (-100). This occurs when batches contain only padding or special tokens. - Check valid_labels count before forward pass - Log warning and skip batch if valid_labels == 0 - Clean up tensors before continuing to next batch - Prevent cross_entropy from receiving empty loss target --- neurons/trainer.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/neurons/trainer.py b/neurons/trainer.py index 0e2d27a3..089487cc 100644 --- a/neurons/trainer.py +++ b/neurons/trainer.py @@ -456,6 +456,18 @@ async def evaluate_model( labels == self.tokenizer.pad_token_id, -100, labels ) + # Skip batch if all labels are masked (would cause NaN in cross_entropy) + valid_labels = (labels != -100).sum().item() + if valid_labels == 0: + tplr.log_with_context( + level="warning", + message=f"Batch {i} has all labels masked (-100), skipping to avoid NaN", + sync_window=self.sync_window, + current_window=self.current_window, + ) + del input_ids, labels + continue + with autocast(device_type=device.type, dtype=torch.bfloat16): logits = model(input_ids) @@ -839,6 +851,15 @@ async def inner_steps( labels == self.tokenizer.pad_token_id, -100, labels ) + # Skip batch if all labels are masked (would cause NaN in cross_entropy) + valid_labels = (labels != -100).sum().item() + if valid_labels == 0: + tplr.logger.warning( + f"Batch {batch_count} has all labels masked (-100), skipping to avoid NaN" + ) + del input_ids, labels + continue + # ------------------------------------------------------------------ # # 3. 
Forward + backward # ------------------------------------------------------------------ # From 84ecdef15b5170b06662e9ef2505542def38ba28 Mon Sep 17 00:00:00 2001 From: Joel Lidin Date: Sun, 18 Jan 2026 04:02:28 +0400 Subject: [PATCH 2/4] (neurons) Switch anneal mode to shard 5 Update miner and validator to use anneal shard 5 instead of shard 4. Update documentation to reflect the new shard number in rclone migration examples. - Change current_shard from 4 to 5 in miner.py - Change current_shard from 4 to 5 in validator.py - Update docs with anneal_000005.npy examples --- docs/shared_sharded_dataset.md | 6 +++--- neurons/miner.py | 4 ++-- neurons/validator.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/shared_sharded_dataset.md b/docs/shared_sharded_dataset.md index ddb5481f..d2fe70fa 100644 --- a/docs/shared_sharded_dataset.md +++ b/docs/shared_sharded_dataset.md @@ -148,9 +148,9 @@ rclone copy r2-source:mixed-dataset-migration/anneal/ r2-dest: ##### Copy specific shards (Partial Migration for Testing) If you want to test with the current anneal shard: ```bash -# Copy anneal shard 4 and its sample IDs -rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000004.npy r2-dest:/anneal/ --progress -rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000004.npy r2-dest:/anneal/ --progress +# Copy anneal shard 5 and its sample IDs +rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000005.npy r2-dest:/anneal/ --progress +rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000005.npy r2-dest:/anneal/ --progress ``` After migration, update your environment variables to point to your bucket: diff --git a/neurons/miner.py b/neurons/miner.py index e1d5e1f6..4ebc9077 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -434,9 +434,9 @@ async def run(self): self.outer_steps_per_shard, self.shard_reset_outer_step, ) - # In anneal mode, always use shard 4 + # In anneal mode, always use shard 5 
if self.dataset_manager.anneal_mode: - current_shard = 4 + current_shard = 5 current_shard_epoch = 0 tplr.logger.info( f"Starting with global_step={self.global_step} (actual outer steps)" diff --git a/neurons/validator.py b/neurons/validator.py index 513c393c..8a407fbc 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -1263,9 +1263,9 @@ async def run(self): self.outer_steps_per_shard, self.shard_reset_outer_step, ) - # In anneal mode, always use shard 4 + # In anneal mode, always use shard 5 if self.dataset_manager.anneal_mode: - current_shard = 4 + current_shard = 5 shard_epoch = 0 # Initialize datasets (only rank 0 downloads, handled internally by dataset_manager) From e66da24fa100a3e24a9b3b6ac1fbb4296d2f1a8f Mon Sep 17 00:00:00 2001 From: Joel Lidin Date: Sun, 18 Jan 2026 04:02:59 +0400 Subject: [PATCH 3/4] Bump run version --- src/tplr/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tplr/__init__.py b/src/tplr/__init__.py index 34924280..265222a5 100644 --- a/src/tplr/__init__.py +++ b/src/tplr/__init__.py @@ -20,7 +20,7 @@ # mypy: ignore-errors # type: ignore -__version__ = "2.1.26" +__version__ = "2.1.27" # Import package. from .chain import * From ea3ec3dbddcbf89abcc3bcc00a1bc8955171f22d Mon Sep 17 00:00:00 2001 From: Joel Lidin Date: Sun, 18 Jan 2026 17:55:06 +0400 Subject: [PATCH 4/4] (hparams) Update anneal scheduler Change from 120 to 150 to mitigate for now always gathering the full 20 peers. --- hparams/hparams.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hparams/hparams.json b/hparams/hparams.json index 8bf3cbba..bb7023e1 100644 --- a/hparams/hparams.json +++ b/hparams/hparams.json @@ -4,7 +4,7 @@ "enabled": true, "start_global_step": 6100, "warmup_inner_steps": 100, - "decay_outer_steps": 120, + "decay_outer_steps": 150, "peak_lr_factor": 0.25, "eta_min_factor": 0.0, "file_prefix": "anneal"