
Commit d4b66d2

full train fixes (instructlab#2382)
- change LR to 2e-5 (learns better)
- honor the gradient accumulation given by the multipack sampler

The LR is the big change here. The previous version of full train has little to no improvement after 8 epochs. This version knows about the new information after as little as 1 epoch. This is a major improvement.

Also add logging to print information about the data.

**Checklist:**

- [ ] **Commit Message Formatting**: Commit titles and messages follow the [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/#summary) guidelines.
- [ ] [Changelog](https://github.com/instructlab/instructlab/blob/main/CHANGELOG.md) updated with breaking and/or notable changes for the next minor release.
- [ ] Documentation has been updated, if necessary.
- [ ] Unit tests have been added, if necessary.
- [ ] Functional tests have been added, if necessary.
- [ ] E2E Workflow tests have been added, if necessary.

Approved-by: jaideepr97
Approved-by: RobotSail
2 parents: 59052f1 + 5d04a5e
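For context, here is a minimal sketch (not the actual instructlab loop) of what the two changes amount to at training time: Adafactor driven at `lr=2e-5`, and an optimizer step taken only once every `accum` packed micro-batches. `model`, `dataloader`, `accum`, and `device` stand in for the objects full_train.py builds.

```python
# Minimal sketch of the two changes, NOT the actual instructlab training loop.
# `model`, `dataloader`, `accum`, and `device` stand in for objects full_train.py builds.
from transformers.optimization import Adafactor


def run_epoch(model, dataloader, accum, device):
    optimizer = Adafactor(
        model.parameters(), lr=2e-5, scale_parameter=False, relative_step=False
    )
    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # scale the loss so the accumulated gradient averages over the micro-batches
        loss = model(**batch).loss / accum
        loss.backward()
        # only step the optimizer every `accum` packed micro-batches
        if (step + 1) % accum == 0:
            optimizer.step()
            optimizer.zero_grad()
```

Dividing the loss by `accum` keeps the accumulated gradient on the same scale as a single large batch, which is what lets the sampler-chosen accumulation factor be honored without retuning the rest of the setup.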

File tree

1 file changed (+5, -4 lines)


src/instructlab/model/full_train.py

@@ -64,7 +64,7 @@ def train(train_args, device):
     )
 
     # based on the length of the dataset, figure out the max batch len
-    packing_max_batch_len, _ = (
+    packing_max_batch_len, accum = (
         multipack_sampler.find_packing_max_batch_len_and_grad_accum(
             num_gpus=1,
             avg_sample_len=dataset.get_lengths().mean(),
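The only change here is that the gradient-accumulation factor returned by the sampler is now captured as `accum` instead of being discarded. As a rough, hypothetical illustration of the arithmetic such a helper performs (the real logic lives in the multipack sampler's `find_packing_max_batch_len_and_grad_accum()`; `estimate_packing` below is not its implementation), the effective batch size in samples becomes a per-step token budget, and the accumulation factor grows until that budget fits under `max_batch_len`:

```python
import math


# Hypothetical illustration only; not the multipack sampler's actual code.
def estimate_packing(effective_batch_size, avg_sample_len, max_batch_len, num_gpus=1):
    grad_accum = 1
    while True:
        # token budget each GPU would need per micro-step at this accumulation factor
        packing_max_batch_len = math.ceil(
            effective_batch_size * avg_sample_len / (grad_accum * num_gpus)
        )
        if packing_max_batch_len <= max_batch_len:
            return packing_max_batch_len, grad_accum
        grad_accum += 1
```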
@@ -99,6 +99,9 @@ def train(train_args, device):
     )
     dataloader.multiprocessing_context = "spawn"
 
+    logger.info(
+        f"avg_sample_len: {dataset.get_lengths().mean()}\n effective_batch_size: {train_args.effective_batch_size}\n max_batch_len: {train_args.max_batch_len}\n packing_max_batch_len: {packing_max_batch_len} \n grad_accum: {accum}\n num_batches: {len(dataloader)}\n avg_samples_per_batch: {len(dataset) / len(dataloader)}"
+    )
     # set device based off argument given
     dev = torch.device(device)
     # auto model based on model path
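The new log line exposes the packing bookkeeping. As an illustrative sanity check on those numbers (not part of the patch), they should roughly satisfy effective_batch_size ≈ avg_samples_per_batch × grad_accum, since each optimizer step consumes `accum` packed batches:

```python
# Illustrative check, not part of the patch: `dataset`, `dataloader`, `accum`, and
# `train_args` are the objects already built in full_train.py.
num_batches = len(dataloader)
avg_samples_per_batch = len(dataset) / num_batches
approx_effective_bs = avg_samples_per_batch * accum
logger.info(
    f"sanity: ~{avg_samples_per_batch:.1f} samples/packed batch * grad_accum {accum} "
    f"~= {approx_effective_bs:.1f} (configured effective_batch_size: {train_args.effective_batch_size})"
)
```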
@@ -151,20 +154,18 @@ def train(train_args, device):
     total_ram = memory_info.total / (1024**3)  # Convert to GB
     logger.info(f"Total RAM: {total_ram:.2f} GB")
     # if RAM is <= 16, we need to use fp16 not fp32. This will yield a worse model but will allow the full pipeline to run
-    accum = 1
     if total_ram <= 16:
         # if <= 16GB ram, use gradinent accum and hald precision
         logger.warning(
             f"Your system has {total_ram:.2f} GB of RAM. This is below our reccomendation of 32GB for this type of training. Using half precision."
         )
         model = model.to(dev).half()  # Convert model to float16
-        accum = 4
     else:
         model = model.to(dev)
 
     # adafactor and gradient checkpointing are memory friendly, we opt to use these in the CPU/MPS loop to fit 7b models.
     optimizer = Adafactor(
-        model.parameters(), lr=1e-5, scale_parameter=True, relative_step=False
+        model.parameters(), lr=2e-5, scale_parameter=False, relative_step=False
     )
     model.gradient_checkpointing_enable()
 
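With the hard-coded `accum = 1` / `accum = 4` pair removed, the low-RAM branch now only decides precision, while the sampler-derived `accum` drives accumulation. A condensed sketch of that branch, assuming `memory_info` comes from `psutil.virtual_memory()` (the diff only shows `memory_info.total`, so treat that and the helper name as assumptions):

```python
import psutil


def place_model(model, device, low_ram_threshold_gb=16):
    # below ~16 GB of system RAM, fall back to float16: a worse model,
    # but the full pipeline can still run with a 7B model
    total_ram_gb = psutil.virtual_memory().total / (1024**3)
    if total_ram_gb <= low_ram_threshold_gb:
        return model.to(device).half()
    return model.to(device)
```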
