
Commit f8e916b

add testing for new training functionality

The existing e2e tests needed to be adapted to now have the following options:

  q: run simple training using 4-bit-quant
  s: run pipeline simple training
  f: run pipeline full training
  a: run accelerated library training

Unit tests switched from referencing --legacy to using --pipeline. The different training techniques changed ordering in this rewrite, meaning some safeguards (which probably always should have been there) need to be put in place to ensure certain checks only happen if we are executing full, accelerated, or multiphase training.

Signed-off-by: Charlie Doern <[email protected]>
1 parent 3897ff2 commit f8e916b
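
For quick reference, these are the invocations the updated CI workflows now use (collected from the workflow diffs below). Each runner size exercises exactly one of the mutually exclusive training modes:

# small worker (e2e-nvidia-t4-x1): minimal run, simple training with 4-bit quantization
./scripts/basic-workflow-tests.sh -msq

# medium worker (e2e-nvidia-a10g-x1): minimal run, full training pipeline on the CPU
./scripts/basic-workflow-tests.sh -mf

# large worker (e2e-nvidia-a10g-x4): minimal run plus eval, vLLM serving, the full
# SDG pipeline, the mixtral model, and accelerated library training
./scripts/basic-workflow-tests.sh -mevFMa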

File tree

9 files changed (+144 -94 lines)


.github/workflows/e2e-nvidia-a10g-x1.yml (+2 -2)

@@ -31,7 +31,7 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ami-00c51d9c1374eda97
-          ec2-instance-type: g5.2xlarge
+          ec2-instance-type: g5.4xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
           iam-role-name: instructlab-ci-runner
@@ -159,7 +159,7 @@ jobs:
 
           python3.11 -m pip show nvidia-nccl-cu12
 
-          ./scripts/basic-workflow-tests.sh -mL
+          ./scripts/basic-workflow-tests.sh -mf
 
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'

.github/workflows/e2e-nvidia-a10g-x4.yml (+1 -1)

@@ -148,7 +148,7 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
-          ./scripts/basic-workflow-tests.sh -mevFMT
+          ./scripts/basic-workflow-tests.sh -mevFMa
 
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'

.github/workflows/e2e-nvidia-t4-x1.yml (+1 -1)

@@ -121,7 +121,7 @@ jobs:
       - name: Run e2e test
         run: |
           . venv/bin/activate
-          ./scripts/basic-workflow-tests.sh -m
+          ./scripts/basic-workflow-tests.sh -msq
 
   stop-runner:
     name: Stop external EC2 runner

requirements.txt (+1)

@@ -35,3 +35,4 @@ transformers>=4.41.2
 trl>=0.9.4
 wandb>=0.16.4
 xdg-base-dirs>=6.0.1
+psutil>=6.0.0

scripts/basic-workflow-tests.sh (+54 -41)

@@ -17,10 +17,12 @@ NUM_INSTRUCTIONS=5
 GENERATE_ARGS=("--num-cpus" "$(nproc)" --taxonomy-path='./taxonomy')
 DIFF_ARGS=("--taxonomy-path" "./taxonomy")
 TRAIN_ARGS=()
-LEGACYTRAIN=0
 PHASED_TRAINING=0
-TRAIN_LIBRARY=0
 BACKEND="llama-cpp"
+FOUR_BIT_QUANT=0
+SIMPLE_TRAIN=0
+FULL_TRAIN=0
+ACCELERATED_TRAIN=0
 HF_TOKEN=${HF_TOKEN:-}
 SDG_PIPELINE="simple"
 SKIP_TRAIN=${SKIP_TRAIN:-0}
@@ -97,15 +99,14 @@ set_defaults() {
         exit 1
     fi
 
-    if [ "${PHASED_TRAINING}" -eq 1 ] && [ "${TRAIN_LIBRARY}" -eq 0 ]; then
-        echo "ERROR: You have -P set. It requires -T."
+    if [ "${PHASED_TRAINING}" -eq 1 ] && [ "${ACCELERATED_TRAIN}" -eq 0 ]; then
+        echo "ERROR: You have -P set. It requires -a."
         exit 1
     fi
 
     if [ "$MINIMAL" -eq 1 ]; then
         # Minimal settings to run in less time
         NUM_INSTRUCTIONS=1
-        TRAIN_ARGS+=("--num-epochs" "1")
     fi
 }
 
@@ -116,14 +117,8 @@ test_smoke() {
 
 test_init() {
     task Initializing ilab
-
-    if [ "$LEGACYTRAIN" -eq 1 ]; then
-        # TODO Only cuda for now
-        step Setting train-profile for GPU accelerated training
-        ilab config init --non-interactive --train-profile="${SCRIPTDIR}/test-data/train-profile-a10.yaml"
-    else
-        ilab config init --non-interactive
-    fi
+
+    ilab config init --non-interactive
 
     step Replace model in config.yaml
     if [ "${BACKEND}" == "vllm" ]; then
@@ -283,29 +278,36 @@ test_generate() {
 test_train() {
     task Train the model
 
-    if [ "$TRAIN_LIBRARY" -eq 1 ]; then
-        local data
-        data=$(find "${DATA_HOME}"/instructlab/datasets -name 'messages_*' | head -n 1)
+    local data
+    data=$(find "${DATA_HOME}"/instructlab/datasets -name 'skills_train_msgs_*' | head -n 1)
+
+    # simple, full, and accelerated are different workflows.
+    # To mimic a real user e2e scenario, only one of these should be run on a given system.
+    # The `small` worker can manage `simple`, the medium worker can handle `full`, and the large worker can handle `accelerated`.
+    if [ "$ACCELERATED_TRAIN" -eq 1 ]; then
         # TODO Only cuda for now
         # the train profile specified in test_init overrides the majority of TRAIN_ARGS, including things like num_epochs. While it looks like much of those settings are being lost, they just have different values here.
-        TRAIN_ARGS=("--device=cuda" "--model-path=${GRANITE_SAFETENSOR_REPO}" "--data-path=${data}" "--lora-quantize-dtype=nf4" "--4-bit-quant" "--effective-batch-size=4" "--is-padding-free=False")
+        TRAIN_ARGS=("--pipeline=accelerated" "--device=cuda" "--model-path=${GRANITE_SAFETENSOR_REPO}" "--data-path=${data}" "--lora-quantize-dtype=nf4" "--4-bit-quant" "--effective-batch-size=4" "--is-padding-free=False")
         if [ "${BACKEND}" != "vllm" ]; then
             TRAIN_ARGS+=("--gguf-model-path" "${CACHE_HOME}/instructlab/models/${GRANITE_GGUF_MODEL}")
         fi
-
-        ilab model train "${TRAIN_ARGS[@]}"
-    else
-        # TODO Only cuda for now
-        TRAIN_ARGS+=("--legacy" "--device=cuda")
-        if [ "$LEGACYTRAIN" -eq 0 ]; then
+    fi
+    if [ "$SIMPLE_TRAIN" -eq 1 ]; then
+        if [ "$FOUR_BIT_QUANT" -eq 1 ]; then
             TRAIN_ARGS+=("--4-bit-quant")
         fi
+        # TODO Only cuda for now
+        TRAIN_ARGS+=("--pipeline=simple" "--device=cuda" "--num-epochs=1")
         if [ "${BACKEND}" != "vllm" ]; then
             TRAIN_ARGS+=("--gguf-model-path" "${CACHE_HOME}/instructlab/models/${GRANITE_GGUF_MODEL}")
         fi
-
-        ilab model train "${TRAIN_ARGS[@]}"
+    fi
+    if [ "$FULL_TRAIN" -eq 1 ]; then
+        # test training on a CPU, not the GPU
+        TRAIN_ARGS=("--num-epochs=1" "--pipeline=full" "--model-path=${GRANITE_SAFETENSOR_REPO}" "--data-path=${data}" "--effective-batch-size=4" "--device=cpu")
+    fi
+
+    ilab model train "${TRAIN_ARGS[@]}"
 }
 
 test_phased_train() {
@@ -451,15 +453,18 @@ test_exec() {
     # When we run training with --4-bit-quant, we can't convert the result to a gguf
     # https://github.com/instructlab/instructlab/issues/579
     # so we skip trying to test the result
-    if [ "$LEGACYTRAIN" -eq 1 ]; then
+    if [ "$FULL_TRAIN" -eq 1 ]; then
         # When you run this --
         # `ilab model convert` is only implemented for macOS with M-series chips for now
         #test_convert
-
-        test_serve trained "${DATA_HOME}/instructlab/checkpoints/model.gguf"
+
+        # when using full train, choose any GGUF from any of the checkpoints dirs
+        model_dir=$(find "${DATA_HOME}"/instructlab/checkpoints/hf_format -name 'samples_*' | head -n 1)
+
+        test_serve trained "${model_dir}/pytorch_model-Q4_K_M.gguf"
         PID=$!
 
-        test_chat
+        ilab model chat -qq --model "${model_dir}/pytorch_model-Q4_K_M.gguf" --endpoint-url http://localhost:8000/v1 'Say "Hello" and nothing else\n'
 
         # Kill the serve process
         task Stopping the ilab model serve for trained model
@@ -508,21 +513,21 @@ wait_for_server() {
 usage() {
     echo "Usage: $0 [-m] [-h]"
     echo "  -e  Run model evaluation"
-    echo "  -T  Use the 'full' training library rather than legacy training"
+    echo "  -q  Use 4-bit-quant when training"
+    echo "  -a  Use the accelerated training library rather than legacy training"
+    echo "  -s  Run the simple training using the SFTTrainer rather than the custom training loop"
     echo "  -f  Run the fullsize training instead of --4-bit-quant"
     echo "  -F  Use the 'full' SDG pipeline instead of the default 'simple' pipeline"
     echo "  -h  Show this help text"
-    echo "  -L  Run legacy training with 4-bit quantization"
     echo "  -m  Run minimal configuration with lower number of instructions and training epochs (run quicker when you have no GPU)"
     echo "  -M  Use the mixtral model (4-bit quantized) instead of merlinite model (4-bit quantized)."
     echo "  -P  Run multi-phase training"
-    echo "  -T  Use the 'full' training library rather than legacy training"
     echo "  -v  Use the vLLM backend for serving"
 }
 
 # Process command line arguments
 task "Configuring ..."
-while getopts "eFhLmMPTv" opt; do
+while getopts "eFhqasfmMPv" opt; do
     case $opt in
     e)
         EVAL=1
@@ -536,10 +541,6 @@ while getopts "eFhLmMPTv" opt; do
         usage
         exit 0
         ;;
-    L)
-        LEGACYTRAIN=1
-        step "Running legacy training with 4-bit quantization."
-        ;;
     m)
         MINIMAL=1
         step "Running minimal configuration."
@@ -552,14 +553,26 @@ while getopts "eFhLmMPTv" opt; do
         PHASED_TRAINING=1
         step "Running multi-phase training."
         ;;
-    T)
-        TRAIN_LIBRARY=1
-        step "Running with training library."
-        ;;
     v)
         BACKEND=vllm
        step "Running with vLLM backend."
         ;;
+    q)
+        FOUR_BIT_QUANT=1
+        step "Running training using 4-bit quantization."
+        ;;
+    s)
+        SIMPLE_TRAIN=1
+        step "Running the simple training pipeline."
+        ;;
+    f)
+        FULL_TRAIN=1
+        step "Running the full training pipeline."
+        ;;
+    a)
+        ACCELERATED_TRAIN=1
+        step "Running using the accelerated training library."
+        ;;
     \?)
         echo "Invalid option: -$opt" >&2
         usage
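
For comparison, here is a minimal sketch of the `ilab model train` invocation each mode assembles in test_train above, assuming the default llama-cpp backend; $DATA, $GGUF_MODEL, and $SAFETENSOR_REPO are illustrative stand-ins for the script's actual variables:

# -s (simple); -q adds --4-bit-quant, and llama-cpp serving adds --gguf-model-path
ilab model train --4-bit-quant --pipeline=simple --device=cuda --num-epochs=1 \
    --gguf-model-path "$GGUF_MODEL"

# -f (full): trains on the CPU against the safetensors model
ilab model train --num-epochs=1 --pipeline=full --model-path="$SAFETENSOR_REPO" \
    --data-path="$DATA" --effective-batch-size=4 --device=cpu

# -a (accelerated): GPU training through the instructlab-training library
ilab model train --pipeline=accelerated --device=cuda --model-path="$SAFETENSOR_REPO" \
    --data-path="$DATA" --lora-quantize-dtype=nf4 --4-bit-quant \
    --effective-batch-size=4 --is-padding-free=False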

src/instructlab/model/full_train.py (+16 -11)

@@ -7,22 +7,11 @@
 import os
 
 # Third Party
-from instructlab.training import config
-from instructlab.training import data_process as dp
-from instructlab.training import (
-    multipack_sampler,
-    token_dataset,
-    tokenizer_utils,
-    utils,
-)
 from instructlab_quantize import run_quantize
-from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import Adafactor, AutoModelForCausalLM
 import numpy as np
 import psutil
-import torch
-import torch.nn.functional as F
 
 # First Party
 from instructlab.llamacpp import llamacpp_convert_to_gguf
@@ -37,6 +26,19 @@ def train(train_args, device):
     Dataloading functions imported from the training library.
     """
 
+    # pylint: disable=no-name-in-module
+    # Third Party
+    from instructlab.training import config
+    from instructlab.training import data_process as dp
+    from instructlab.training import (
+        multipack_sampler,
+        token_dataset,
+        tokenizer_utils,
+        utils,
+    )
+    from torch.utils.data import DataLoader
+    import torch
+
     dp.main(
         config.DataProcessArgs(
             data_output_path=train_args.data_output_dir,
@@ -269,6 +271,9 @@ def train(train_args, device):
 def pad_collate_fn(batch, pad_token_id):
     lens = np.array([len(item["input_ids"]) for item in batch])
     max_len = max(lens)
+    # Third Party
+    import torch
+    import torch.nn.functional as F
 
     input_ids = torch.stack(
         [