diff --git a/docs/shared_sharded_dataset.md b/docs/shared_sharded_dataset.md index ddb5481f..d2fe70fa 100644 --- a/docs/shared_sharded_dataset.md +++ b/docs/shared_sharded_dataset.md @@ -148,9 +148,9 @@ rclone copy r2-source:mixed-dataset-migration/anneal/ r2-dest: ##### Copy specific shards (Partial Migration for Testing) If you want to test with the current anneal shard: ```bash -# Copy anneal shard 4 and its sample IDs -rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000004.npy r2-dest:/anneal/ --progress -rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000004.npy r2-dest:/anneal/ --progress +# Copy anneal shard 5 and its sample IDs +rclone copy r2-source:mixed-dataset-migration/anneal/anneal_000005.npy r2-dest:/anneal/ --progress +rclone copy r2-source:mixed-dataset-migration/anneal/sample_ids_anneal_000005.npy r2-dest:/anneal/ --progress ``` After migration, update your environment variables to point to your bucket: diff --git a/hparams/hparams.json b/hparams/hparams.json index 8bf3cbba..bb7023e1 100644 --- a/hparams/hparams.json +++ b/hparams/hparams.json @@ -4,7 +4,7 @@ "enabled": true, "start_global_step": 6100, "warmup_inner_steps": 100, - "decay_outer_steps": 120, + "decay_outer_steps": 150, "peak_lr_factor": 0.25, "eta_min_factor": 0.0, "file_prefix": "anneal" diff --git a/neurons/miner.py b/neurons/miner.py index e1d5e1f6..4ebc9077 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -434,9 +434,9 @@ async def run(self): self.outer_steps_per_shard, self.shard_reset_outer_step, ) - # In anneal mode, always use shard 4 + # In anneal mode, always use shard 5 if self.dataset_manager.anneal_mode: - current_shard = 4 + current_shard = 5 current_shard_epoch = 0 tplr.logger.info( f"Starting with global_step={self.global_step} (actual outer steps)" diff --git a/neurons/trainer.py b/neurons/trainer.py index 0e2d27a3..089487cc 100644 --- a/neurons/trainer.py +++ b/neurons/trainer.py @@ -456,6 +456,18 @@ async def evaluate_model( labels == self.tokenizer.pad_token_id, -100, labels ) + # Skip batch if all labels are masked (would cause NaN in cross_entropy) + valid_labels = (labels != -100).sum().item() + if valid_labels == 0: + tplr.log_with_context( + level="warning", + message=f"Batch {i} has all labels masked (-100), skipping to avoid NaN", + sync_window=self.sync_window, + current_window=self.current_window, + ) + del input_ids, labels + continue + with autocast(device_type=device.type, dtype=torch.bfloat16): logits = model(input_ids) @@ -839,6 +851,15 @@ async def inner_steps( labels == self.tokenizer.pad_token_id, -100, labels ) + # Skip batch if all labels are masked (would cause NaN in cross_entropy) + valid_labels = (labels != -100).sum().item() + if valid_labels == 0: + tplr.logger.warning( + f"Batch {batch_count} has all labels masked (-100), skipping to avoid NaN" + ) + del input_ids, labels + continue + # ------------------------------------------------------------------ # # 3. Forward + backward # ------------------------------------------------------------------ # diff --git a/neurons/validator.py b/neurons/validator.py index 513c393c..8a407fbc 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -1263,9 +1263,9 @@ async def run(self): self.outer_steps_per_shard, self.shard_reset_outer_step, ) - # In anneal mode, always use shard 4 + # In anneal mode, always use shard 5 if self.dataset_manager.anneal_mode: - current_shard = 4 + current_shard = 5 shard_epoch = 0 # Initialize datasets (only rank 0 downloads, handled internally by dataset_manager) diff --git a/src/tplr/__init__.py b/src/tplr/__init__.py index 34924280..265222a5 100644 --- a/src/tplr/__init__.py +++ b/src/tplr/__init__.py @@ -20,7 +20,7 @@ # mypy: ignore-errors # type: ignore -__version__ = "2.1.26" +__version__ = "2.1.27" # Import package. from .chain import *