
Commit 3897ff2

introduce --pipeline
--pipeline has 3 options: simple, full, and accelerated. 'simple' will run either MLX train or Linux train. 'full' will run the CPU/MPS-optimized version of full fine-tuning. 'accelerated' will shell out to the library code for larger GPU support. This conforms well with SDG --pipeline, and it also allows us to maintain the SFTTrainer and MLX paths while also supporting our own training loop.

Signed-off-by: Charlie Doern <[email protected]>
1 parent b3165dc commit 3897ff2
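
For orientation, here is a minimal, self-contained sketch of the dispatch the commit message describes. It is illustrative only: the stub bodies, the --device check, and everything other than the --pipeline choices are assumptions, not code from this repository.

    # Illustrative sketch: a click command exposing the same --pipeline choices and
    # routing each one to a stand-in for the corresponding training path.
    import click

    @click.command()
    @click.option(
        "--pipeline",
        type=click.Choice(["simple", "full", "accelerated"]),
        default="accelerated",
    )
    @click.option("--device", default="cpu")
    def train(pipeline: str, device: str) -> None:
        # "accelerated" (or a high-fidelity device) shells out to the training library.
        if pipeline == "accelerated" or device in ("cuda", "hpu"):
            click.echo("accelerated: distributed GPU training via the library code")
        # "full" runs the CPU/MPS-optimized fine-tuning loop.
        elif pipeline == "full":
            click.echo("full: CPU/MPS optimized full fine-tuning")
        # "simple" keeps the SFTTrainer (Linux) and MLX (macOS) paths.
        else:
            click.echo("simple: SFTTrainer on Linux, MLX LoRA on macOS")

    if __name__ == "__main__":
        train()

Invoked as, for example, `python sketch.py --pipeline full --device cpu`, the command simply echoes which training path would be taken.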

File tree

1 file changed (+161 / -10 lines)

src/instructlab/model/train.py

+161 -10
@@ -281,11 +281,6 @@ def clickpath_setup(is_dir: bool) -> click.Path:
     config_sections=ADDITIONAL_ARGUMENTS,
     required=True,  # default from config
 )
-@click.option(
-    "--legacy",
-    is_flag=True,
-    help="if true, enables the legacy linux training code path from release 0.17.0 and prior.",
-)
 @click.option(
     "--strategy",
     type=click.Choice(
@@ -361,6 +356,14 @@ def clickpath_setup(is_dir: bool) -> click.Path:
     is_flag=True,
     help="By default, checkpoints are saved at the end of each training epoch. This option disables this behavior.",
 )
+@click.option(
+    "--pipeline",
+    type=click.Choice(["simple", "full", "accelerated"]),
+    default="accelerated",
+    help="Model fidelity pipeline for training: 'simple' uses SFTTrainer on Linux or MLX on MacOS, producing low-fidelity models quickly for rapid prototyping. "
+    "'full' employs CPU and MPS optimized InstructLab fine-tuning, generating medium-fidelity models over a longer period. "
+    "'accelerated' utilizes GPU acceleration and distributed training, yielding high-fidelity models but requiring more time. Choose based on your hardware, time constraints, and desired model quality.",
+)
 @click.pass_context
 @clickext.display_params
 def train(
@@ -377,7 +380,6 @@ def train(
     num_epochs,
     device: str,
     four_bit_quant: bool,
-    legacy,
     strategy: str | None,
     phased_base_dir: pathlib.Path,
     phased_phase1_data: pathlib.Path | None,
@@ -391,13 +393,21 @@ def train(
     phased_mt_bench_judge: pathlib.Path | None,
     skip_user_confirm: bool,
     enable_serving_output: bool,
+    pipeline: str,
     **kwargs,
 ):
     """
     Takes synthetic data generated locally with `ilab data generate` and the previous model and learns a new model using the MLX API.
     On success, writes newly learned model to {model_dir}/mlx_model, which is where `chatmlx` will look for a model.
     """
     torch.set_autocast_enabled(False)
+
+    if (
+        pipeline in ("full", "simple")
+        and strategy == SupportedTrainingStrategies.LAB_MULTIPHASE.value
+    ):
+        ctx.fail("Multi Phase training is only supported with `--pipeline accelerated`")
+
     if not input_dir:
         # By default, generate output-dir is used as train input-dir
         input_dir = ctx.obj.config.generate.output_dir
@@ -537,12 +547,153 @@ def get_files(directory: str, pattern: str) -> list[str]:
     )
 
     # we can use train_args locally to run lower fidelity training
-    if is_high_fidelity(device):
+    if is_high_fidelity(device) or pipeline == "accelerated":
         run_training(train_args=train_args, torch_args=torch_args, device=device)
-    else:
+    elif not is_high_fidelity(device) or pipeline == "full":
+        # if on CPU or MPS, execute full train, which is based
+        # off of the structure of the training repo, just with different optimizers, model sizes, and special data gradient accumulation to get it
+        # to fit on most consumer laptops
         full_train.train(train_args, device)
-
-
+    elif pipeline == "simple":
+        if utils.is_macos_with_m_chip() and not strategy:
+            # Local
+            from ..mlx_explore.gguf_convert_to_mlx import load
+            from ..mlx_explore.utils import fetch_tokenizer_from_hub
+            from ..train.lora_mlx.convert import convert_between_mlx_and_pytorch
+            from ..train.lora_mlx.lora import load_and_train
+            from ..train.lora_mlx.make_data import make_data
+
+            if not skip_preprocessing:
+                try:
+                    make_data(data_dir=data_path)
+                except FileNotFoundError as exc:
+                    click.secho(
+                        f"Could not read from data directory: {exc}",
+                        fg="red",
+                    )
+                    raise click.exceptions.Exit(1)
+
+            # NOTE we can skip this if we have a way to ship MLX
+            # PyTorch safetensors to MLX safetensors
+            model_dir_local = model_path.replace("/", "-")
+            model_dir_local = f"{ckpt_output_dir}/{model_dir_local}"
+            model_dir_mlx = f"{model_dir_local}-mlx"
+            model_dir_mlx_quantized = f"{model_dir_local}-mlx-q"
+
+            if skip_quantize:
+                dest_model_dir = model_dir_mlx
+                quantize_arg = False
+            else:
+                dest_model_dir = model_dir_mlx_quantized
+                quantize_arg = True
+
+            if tokenizer_dir is not None and gguf_model_path is not None:
+                if not local:
+                    tokenizer_dir_local = tokenizer_dir.replace("/", "-")
+                    fetch_tokenizer_from_hub(tokenizer_dir, tokenizer_dir_local)
+
+                # no need to pass quantize_arg for now, script automatically detects if quantization is necessary based on whether gguf model is quantized or not
+                load(
+                    gguf=gguf_model_path,
+                    repo=tokenizer_dir,
+                    mlx_path=dest_model_dir,
+                )
+
+                for filename in os.listdir(model_dir_local):
+                    shutil.copy(
+                        os.path.join(model_dir_local, filename),
+                        os.path.join(dest_model_dir, filename),
+                    )
+                shutil.rmtree(model_dir_local, ignore_errors=True)
+
+            else:
+                # Downloading PyTorch SafeTensor and Converting to MLX SafeTensor
+                convert_between_mlx_and_pytorch(
+                    hf_path=model_path,
+                    mlx_path=dest_model_dir,
+                    quantize=quantize_arg,
+                    local=local,
+                )
+
+            adapter_file_path = f"{dest_model_dir}/adapters.npz"
+
+            # train the model with LoRA
+            load_and_train(
+                model=dest_model_dir,
+                train=True,
+                data=data_path,
+                adapter_file=adapter_file_path,
+                iters=iters,
+                save_every=10,
+                steps_per_eval=10,
+            )
+        else:
+            # Local
+            from ..llamacpp.llamacpp_convert_to_gguf import convert_llama_to_gguf
+            from ..train.linux_train import linux_train
+
+            training_results_dir = linux_train(
+                ctx=ctx,
+                train_file=train_file,
+                test_file=test_file,
+                model_name=model_path,
+                num_epochs=num_epochs,
+                train_device=device,
+                four_bit_quant=four_bit_quant,
+            )
+
+            final_results_dir = training_results_dir / "final"
+            if final_results_dir.exists():
+                shutil.rmtree(final_results_dir)
+            final_results_dir.mkdir()
+
+            gguf_models_dir = Path(DEFAULTS.CHECKPOINTS_DIR)
+            gguf_models_dir.mkdir(exist_ok=True)
+            gguf_models_file = gguf_models_dir / "ggml-model-f16.gguf"
+
+            # Remove previously trained model, it's taking up space we may need in the next step
+            gguf_models_file.unlink(missing_ok=True)
+
+            # TODO: Figure out what to do when there are multiple checkpoint dirs.
+            # Right now it's just copying files from the first one numerically, not necessarily the best one
+            for fpath in (
+                "checkpoint-*/added_tokens.json",
+                "checkpoint-*/special_tokens_map.json",
+                "checkpoint-*/tokenizer.json",
+                "checkpoint-*/tokenizer.model",
+                "checkpoint-*/tokenizer_config.json",
+                "merged_model/config.json",
+                "merged_model/generation_config.json",
+            ):
+                file_ = next(training_results_dir.glob(fpath))
+                shutil.copy(file_, final_results_dir)
+                logger.info(f"Copied {file_} to {final_results_dir}")
+
+            for file in training_results_dir.glob("merged_model/*.safetensors"):
+                shutil.move(file, final_results_dir)
+                logger.info(f"Moved {file} to {final_results_dir}")
+
+            if four_bit_quant:
+                logger.info(
+                    "SKIPPING CONVERSION to gguf. This is unsupported with --4-bit-quant. "
+                    + "See https://github.com/instructlab/instructlab/issues/579."
+                )
+                return
+
+            gguf_file_path = convert_llama_to_gguf(
+                model=final_results_dir, pad_vocab=True
+            )
+
+            # Remove safetensors files to save space, we're done with them here
+            # and the huggingface lib has them cached
+            for file in final_results_dir.glob("*.safetensors"):
+                file.unlink()
+
+            shutil.move(gguf_file_path, gguf_models_file)
+            logger.info(f"Saved trained model to {gguf_models_file}")
+
+
+# chooses which type of training to run depending on the device provided
 def is_high_fidelity(device):
     return device == "cuda" or device == "hpu"
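
One behavioral note from the diff above: the guard added to train() rejects multi-phase training unless the accelerated pipeline is selected. A standalone sketch of that check follows; the enum value used here is an assumption for illustration, not the repository's actual SupportedTrainingStrategies definition.

    # Sketch of the multi-phase guard added in this commit; the enum value is assumed.
    import enum

    class SupportedTrainingStrategies(enum.Enum):
        LAB_MULTIPHASE = "lab-multiphase"

    def check_pipeline(pipeline: str, strategy: str | None) -> None:
        # Mirrors the ctx.fail(...) branch: only "accelerated" may be combined
        # with the multi-phase strategy.
        if (
            pipeline in ("full", "simple")
            and strategy == SupportedTrainingStrategies.LAB_MULTIPHASE.value
        ):
            raise SystemExit(
                "Multi Phase training is only supported with `--pipeline accelerated`"
            )

    check_pipeline("accelerated", "lab-multiphase")  # allowed
    check_pipeline("simple", None)                   # allowed
    # check_pipeline("simple", "lab-multiphase")     # would exit with the message above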
