
Commit bfc43de

Authored by azahed98 (Arsh Zahed) and orangetin
ENG 12851: Add warmup ratio and update AsyncFineTuning (#202)
* Add warmup ratio and update async
* Add warmup_ratio to args

Co-authored-by: Arsh Zahed <[email protected]>
Co-authored-by: orangetin <[email protected]>
1 parent e651fde commit bfc43de

File tree: 4 files changed, +148 −53 lines

  src/together/cli/api/finetune.py
  src/together/cli/api/utils.py
  src/together/resources/finetune.py
  src/together/types/finetune.py

src/together/cli/api/finetune.py

Lines changed: 9 additions & 0 deletions
@@ -60,6 +60,12 @@ def fine_tuning(ctx: click.Context) -> None:
 )
 @click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
 @click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
+@click.option(
+    "--warmup-ratio",
+    type=float,
+    default=0.0,
+    help="Warmup ratio for learning rate scheduler.",
+)
 @click.option(
     "--lora/--no-lora",
     type=bool,
@@ -97,6 +103,7 @@ def create(
     n_checkpoints: int,
     batch_size: int | Literal["max"],
     learning_rate: float,
+    warmup_ratio: float,
     lora: bool,
     lora_r: int,
     lora_dropout: float,
@@ -118,6 +125,7 @@ def create(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        warmup_ratio=warmup_ratio,
         lora=lora,
         lora_r=lora_r,
         lora_dropout=lora_dropout,
@@ -186,6 +194,7 @@ def create(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        warmup_ratio=warmup_ratio,
         lora=lora,
         lora_r=lora_r,
         lora_dropout=lora_dropout,
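As a quick illustration of the new flag, a job could be launched from the CLI with a warmup ratio roughly like the command below; the file ID and model name are placeholders, and the exact subcommand spelling may differ between CLI versions:

    together fine-tuning create \
      --training-file file-abc123 \
      --model meta-llama/Meta-Llama-3-8B \
      --learning-rate 1e-5 \
      --warmup-ratio 0.05 \
      --lora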

src/together/cli/api/utils.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import click
 
 from typing import Literal
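The `from __future__ import annotations` import enables postponed evaluation of annotations, which is what lets signatures in this module use PEP 604 unions such as `int | Literal["max"]` on Python versions older than 3.10. A minimal sketch, assuming a hypothetical helper that is not part of this commit:

    from __future__ import annotations

    from typing import Literal

    # The union annotation below is stored as a string and never evaluated at
    # runtime, so this parses and runs on Python 3.8/3.9 as well as 3.10+.
    def resolve_batch_size(batch_size: int | Literal["max"], max_batch_size: int) -> int:
        return max_batch_size if batch_size == "max" else batch_size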

src/together/resources/finetune.py

Lines changed: 133 additions & 53 deletions
@@ -25,6 +25,81 @@
 from together.utils import log_warn_once, normalize_key
 
 
+def createFinetuneRequest(
+    model_limits: FinetuneTrainingLimits,
+    training_file: str,
+    model: str,
+    n_epochs: int = 1,
+    validation_file: str | None = "",
+    n_evals: int | None = 0,
+    n_checkpoints: int | None = 1,
+    batch_size: int | Literal["max"] = "max",
+    learning_rate: float | None = 0.00001,
+    warmup_ratio: float | None = 0.0,
+    lora: bool = False,
+    lora_r: int | None = None,
+    lora_dropout: float | None = 0,
+    lora_alpha: float | None = None,
+    lora_trainable_modules: str | None = "all-linear",
+    suffix: str | None = None,
+    wandb_api_key: str | None = None,
+) -> FinetuneRequest:
+    if batch_size == "max":
+        log_warn_once(
+            "Starting from together>=1.3.0, "
+            "the default batch size is set to the maximum allowed value for each model."
+        )
+    if warmup_ratio is None:
+        warmup_ratio = 0.0
+
+    training_type: TrainingType = FullTrainingType()
+    if lora:
+        if model_limits.lora_training is None:
+            raise ValueError("LoRA adapters are not supported for the selected model.")
+        lora_r = lora_r if lora_r is not None else model_limits.lora_training.max_rank
+        lora_alpha = lora_alpha if lora_alpha is not None else lora_r * 2
+        training_type = LoRATrainingType(
+            lora_r=lora_r,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            lora_trainable_modules=lora_trainable_modules,
+        )
+
+        batch_size = (
+            batch_size
+            if batch_size != "max"
+            else model_limits.lora_training.max_batch_size
+        )
+    else:
+        if model_limits.full_training is None:
+            raise ValueError("Full training is not supported for the selected model.")
+        batch_size = (
+            batch_size
+            if batch_size != "max"
+            else model_limits.full_training.max_batch_size
+        )
+
+    if warmup_ratio > 1 or warmup_ratio < 0:
+        raise ValueError("Warmup ratio should be between 0 and 1")
+
+    finetune_request = FinetuneRequest(
+        model=model,
+        training_file=training_file,
+        validation_file=validation_file,
+        n_epochs=n_epochs,
+        n_evals=n_evals,
+        n_checkpoints=n_checkpoints,
+        batch_size=batch_size,
+        learning_rate=learning_rate,
+        warmup_ratio=warmup_ratio,
+        training_type=training_type,
+        suffix=suffix,
+        wandb_key=wandb_api_key,
+    )
+
+    return finetune_request
+
+
 class FineTuning:
     def __init__(self, client: TogetherClient) -> None:
         self._client = client
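The new createFinetuneRequest helper centralizes the request-building logic that was previously inlined in FineTuning.create, so the sync and async clients can share it. A sketch of calling it directly, assuming the public Together client class and using a hypothetical model name and file ID (the helper is internal, so this is illustration only):

    from together import Together
    from together.resources.finetune import createFinetuneRequest

    client = Together()  # assumes TOGETHER_API_KEY is set in the environment
    limits = client.fine_tuning.get_model_limits(model="meta-llama/Meta-Llama-3-8B")

    # Builds the request body, resolving batch_size="max" from the model limits
    # and validating that warmup_ratio lies in [0, 1].
    request = createFinetuneRequest(
        model_limits=limits,
        training_file="file-abc123",          # hypothetical uploaded file ID
        model="meta-llama/Meta-Llama-3-8B",   # hypothetical model name
        n_epochs=3,
        warmup_ratio=0.05,
        lora=True,
    )
    print(request.model_dump(exclude_none=True))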
@@ -40,6 +115,7 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        warmup_ratio: float | None = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -64,6 +140,7 @@ def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -82,65 +159,33 @@ def create(
             FinetuneResponse: Object containing information about fine-tuning job.
         """
 
-        if batch_size == "max":
-            log_warn_once(
-                "Starting from together>=1.3.0, "
-                "the default batch size is set to the maximum allowed value for each model."
-            )
-
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
 
         if model_limits is None:
             model_limits = self.get_model_limits(model=model)
 
-        training_type: TrainingType = FullTrainingType()
-        if lora:
-            if model_limits.lora_training is None:
-                raise ValueError(
-                    "LoRA adapters are not supported for the selected model."
-                )
-            lora_r = (
-                lora_r if lora_r is not None else model_limits.lora_training.max_rank
-            )
-            lora_alpha = lora_alpha if lora_alpha is not None else lora_r * 2
-            training_type = LoRATrainingType(
-                lora_r=lora_r,
-                lora_alpha=lora_alpha,
-                lora_dropout=lora_dropout,
-                lora_trainable_modules=lora_trainable_modules,
-            )
-
-            batch_size = (
-                batch_size
-                if batch_size != "max"
-                else model_limits.lora_training.max_batch_size
-            )
-        else:
-            if model_limits.full_training is None:
-                raise ValueError(
-                    "Full training is not supported for the selected model."
-                )
-            batch_size = (
-                batch_size
-                if batch_size != "max"
-                else model_limits.full_training.max_batch_size
-            )
-
-        finetune_request = FinetuneRequest(
-            model=model,
+        finetune_request = createFinetuneRequest(
+            model_limits=model_limits,
             training_file=training_file,
-            validation_file=validation_file,
+            model=model,
             n_epochs=n_epochs,
+            validation_file=validation_file,
             n_evals=n_evals,
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
-            training_type=training_type,
+            warmup_ratio=warmup_ratio,
+            lora=lora,
+            lora_r=lora_r,
+            lora_dropout=lora_dropout,
+            lora_alpha=lora_alpha,
+            lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
-            wandb_key=wandb_api_key,
+            wandb_api_key=wandb_api_key,
         )
+
         if verbose:
             rprint(
                 "Submitting a fine-tuning job with the following parameters:",
@@ -377,12 +422,20 @@ async def create(
         model: str,
         n_epochs: int = 1,
         validation_file: str | None = "",
-        n_evals: int = 0,
+        n_evals: int | None = 0,
         n_checkpoints: int | None = 1,
-        batch_size: int | None = 32,
-        learning_rate: float = 0.00001,
+        batch_size: int | Literal["max"] = "max",
+        learning_rate: float | None = 0.00001,
+        warmup_ratio: float | None = 0.0,
+        lora: bool = False,
+        lora_r: int | None = None,
+        lora_dropout: float | None = 0,
+        lora_alpha: float | None = None,
+        lora_trainable_modules: str | None = "all-linear",
         suffix: str | None = None,
         wandb_api_key: str | None = None,
+        verbose: bool = False,
+        model_limits: FinetuneTrainingLimits | None = None,
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
@@ -395,13 +448,23 @@ async def create(
             n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
             n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
                 Defaults to 1.
-            batch_size (int, optional): Batch size for fine-tuning. Defaults to 32.
+            batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
+            lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
+            lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
+            lora_alpha (float, optional): Alpha for LoRA adapters. Defaults to 8.
+            lora_trainable_modules (str, optional): Trainable modules for LoRA adapters. Defaults to "all-linear".
             suffix (str, optional): Up to 40 character suffix that will be added to your fine-tuned model name.
                 Defaults to None.
             wandb_api_key (str, optional): API key for Weights & Biases integration.
                 Defaults to None.
+            verbose (bool, optional): whether to print the job parameters before submitting a request.
+                Defaults to False.
+            model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
+                Defaults to None.
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -411,18 +474,35 @@ async def create(
             client=self._client,
         )
 
-        parameter_payload = FinetuneRequest(
-            model=model,
+        if model_limits is None:
+            model_limits = await self.get_model_limits(model=model)
+
+        finetune_request = createFinetuneRequest(
+            model_limits=model_limits,
             training_file=training_file,
-            validation_file=validation_file,
+            model=model,
             n_epochs=n_epochs,
+            validation_file=validation_file,
             n_evals=n_evals,
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            warmup_ratio=warmup_ratio,
+            lora=lora,
+            lora_r=lora_r,
+            lora_dropout=lora_dropout,
+            lora_alpha=lora_alpha,
+            lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
-            wandb_key=wandb_api_key,
-        ).model_dump(exclude_none=True)
+            wandb_api_key=wandb_api_key,
+        )
+
+        if verbose:
+            rprint(
+                "Submitting a fine-tuning job with the following parameters:",
+                finetune_request,
+            )
+        parameter_payload = finetune_request.model_dump(exclude_none=True)
 
         response, _, _ = await requestor.arequest(
             options=TogetherRequest(
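The async client now mirrors the sync signature, including warmup_ratio, the LoRA options, verbose, and model_limits. A sketch of the async path, assuming the AsyncTogether client class and hypothetical IDs not taken from this diff:

    import asyncio

    from together import AsyncTogether

    async def main() -> None:
        client = AsyncTogether()  # assumes TOGETHER_API_KEY is set in the environment
        job = await client.fine_tuning.create(
            training_file="file-abc123",          # hypothetical uploaded file ID
            model="meta-llama/Meta-Llama-3-8B",   # hypothetical model name
            batch_size="max",                     # resolved against the model limits
            warmup_ratio=0.05,
            verbose=True,
        )
        print(job.id)

    asyncio.run(main())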

src/together/types/finetune.py

Lines changed: 4 additions & 0 deletions
@@ -150,6 +150,8 @@ class FinetuneRequest(BaseModel):
     n_epochs: int
     # training learning rate
     learning_rate: float
+    # learning rate warmup ratio
+    warmup_ratio: float
     # number of checkpoints to save
     n_checkpoints: int | None = None
     # number of evaluation loops to run
@@ -190,6 +192,8 @@ class FinetuneResponse(BaseModel):
     batch_size: int | None = None
     # training learning rate
     learning_rate: float | None = None
+    # learning rate warmup ratio
+    warmup_ratio: float | None = None
     # number of steps between evals
     eval_steps: int | None = None
     # training type
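For context, warmup_ratio is conventionally the fraction of total optimizer steps spent ramping the learning rate from zero up to its target value; the scheduler itself runs server-side and is not part of this diff, so the sketch below only illustrates the usual interpretation under that assumption:

    def warmup_steps(total_steps: int, warmup_ratio: float) -> int:
        # Mirrors the bounds check added in createFinetuneRequest
        if not 0.0 <= warmup_ratio <= 1.0:
            raise ValueError("Warmup ratio should be between 0 and 1")
        return int(total_steps * warmup_ratio)

    # e.g. 1,000 optimizer steps with warmup_ratio=0.05 -> 50 warmup steps
    assert warmup_steps(1_000, 0.05) == 50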
