15 changes: 15 additions & 0 deletions tests/e2e/e2e_utils.py
@@ -62,6 +62,21 @@ def data_collator(batch):

        oneshot_kwargs["data_collator"] = data_collator

    elif "calibration" in dataset_id:

        def data_collator(batch):
            assert len(batch) == 1
            return {
                key: (
                    torch.tensor(value)
                    if key != "pixel_values"
                    else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
                )
                for key, value in batch[0].items()
            }

        oneshot_kwargs["data_collator"] = data_collator

    oneshot_kwargs["model"] = loaded_model
    if recipe:
        oneshot_kwargs["recipe"] = recipe
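
For reference, this is how the new single-sample collator behaves in isolation. The sample dict below is made up purely for illustration; real calibration samples come from the `neuralmagic/calibration` preprocessing added in `tests/testing_utils.py`.

```python
import torch


def data_collator(batch):
    # Same logic as above: one sample per batch, every field becomes a tensor,
    # and pixel_values is cast to bfloat16 with its leading dim squeezed away.
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Illustrative sample; shapes and values are arbitrary.
sample = {
    "input_ids": [[101, 2054, 2003]],
    "attention_mask": [[1, 1, 1]],
    "pixel_values": [[[0.1, 0.2], [0.3, 0.4]]],
}
out = data_collator([sample])
print({k: (v.dtype, tuple(v.shape)) for k, v in out.items()})
# input_ids -> int64 (1, 3), pixel_values -> bfloat16 (2, 2)
```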
20 changes: 12 additions & 8 deletions tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -1,18 +1,22 @@
cadence: weekly
model: Qwen/Qwen2.5-VL-7B-Instruct
model_class: Qwen2_5_VLForConditionalGeneration
model: Qwen/Qwen3-VL-8B-Instruct
model_class: Qwen3VLForConditionalGeneration
scheme: FP8_DYNAMIC
recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
lmeval:
  model: "hf-multimodal"
  model_args:
    dtype: bfloat16
    add_bos_token: True
    convert_img_format: True
  task: mmmu_val_literature
  task: chartqa
  apply_chat_template: True
  num_fewshot: 0
  batch_size: 8
  # dense model achieves accuracy of 0.9 +/- 0.0557
  batch_size: 100
  limit: 100
  # dense model achieves exact_match accuracy of 0.530
  # dense model achieves relaxed_accuracy of 0.780
  # dense model achieves anywhere_accuracy of 0.800
  metrics:
    acc,none: 0.8333
    acc_stderr,none: 0.0557
    exact_match,none: 0.530
    relaxed_accuracy,none: 0.780
    anywhere_accuracy,none: 0.810
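
For context, the `lmeval` block above corresponds roughly to the following lm-evaluation-harness call. This is a sketch under the assumption that `simple_evaluate` is driven the same way as in `tests/lmeval/test_lmeval.py`; exact argument handling in the test harness may differ, and `pretrained` here is simply the model named in this config.

```python
import lm_eval

# Rough equivalent of the lmeval section above. chartqa reports
# exact_match / relaxed_accuracy / anywhere_accuracy instead of plain acc,
# which is why the expected metrics in this config changed.
results = lm_eval.simple_evaluate(
    model="hf-multimodal",
    model_args={
        "pretrained": "Qwen/Qwen3-VL-8B-Instruct",
        "dtype": "bfloat16",
        "add_bos_token": True,
        "convert_img_format": True,
    },
    tasks=["chartqa"],
    num_fewshot=0,
    limit=100,
    batch_size=100,
    apply_chat_template=True,
    device="cuda:0",
)
print(results["results"]["chartqa"])
```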
25 changes: 15 additions & 10 deletions tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -1,20 +1,25 @@
cadence: "weekly"
model: Qwen/Qwen2.5-VL-7B-Instruct
model_class: Qwen2_5_VLForConditionalGeneration
model: Qwen/Qwen3-VL-8B-Instruct
model_class: Qwen3VLForConditionalGeneration
scheme: INT8_dyn_per_token
recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
dataset_id: lmms-lab/flickr30k
dataset_split: "test[:512]"
dataset_id: neuralmagic/calibration
dataset_config: LLM
dataset_split: "train[:512]"
lmeval:
model: "hf-multimodal"
model_args:
dtype: bfloat16
add_bos_token: True
convert_img_format: True
task: mmmu_val_literature
task: chartqa
apply_chat_template: True
num_fewshot: 0
batch_size: 8
# dense model achieves accuracy of 0.9 +/ 0.0557
batch_size: 100
limit: 100
# dense model achieves exact_match accuracy of 0.520
# dense model achieves relaxed_accuracy of 0.780
# dense model achieves anywhere_accuracy of 0.800
metrics:
acc,none: 0.833
acc_stderr,none: 0.0557
exact_match,none: 0.550
relaxed_accuracy,none: 0.770
anywhere_accuracy,none: 0.770
25 changes: 15 additions & 10 deletions tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -1,20 +1,25 @@
cadence: "weekly"
model: Qwen/Qwen2.5-VL-7B-Instruct
model_class: Qwen2_5_VLForConditionalGeneration
model: Qwen/Qwen3-VL-8B-Instruct
model_class: Qwen3VLForConditionalGeneration
scheme: W4A16_actorder_weight
recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
dataset_id: lmms-lab/flickr30k
dataset_split: "test[:512]"
dataset_id: neuralmagic/calibration
dataset_config: LLM
dataset_split: "train[:512]"
lmeval:
model: "hf-multimodal"
model_args:
dtype: bfloat16
add_bos_token: True
convert_img_format: True
task: mmmu_val_literature
task: chartqa
apply_chat_template: True
num_fewshot: 0
batch_size: 8
# dense model achieves accuracy of 0.9 +/ 0.0557
batch_size: 100
limit: 100
# dense model achieves exact_match accuracy of 0.520
# dense model achieves relaxed_accuracy of 0.780
# dense model achieves anywhere_accuracy of 0.800
metrics:
acc,none: 0.8333
acc_stderr,none: 0.0557
exact_match,none: 0.540
relaxed_accuracy,none: 0.780
anywhere_accuracy,none: 0.800
3 changes: 3 additions & 0 deletions tests/lmeval/test_lmeval.py
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
    num_fewshot: int = 5
    limit: int = 1000
    batch_size: int = 100
    apply_chat_template: bool = False
    # Recovery testing (default): compare against base model performance
    # Default threshold is 0.95 (retain ≥95% of base), can be overridden
    recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
            num_fewshot=self.lmeval.num_fewshot,
            limit=self.lmeval.limit,
            device="cuda:0",
            apply_chat_template=self.lmeval.apply_chat_template,
            batch_size=self.lmeval.batch_size,
        )

@@ -190,6 +192,7 @@ def _run_lm_eval(self):
            num_fewshot=self.lmeval.num_fewshot,
            limit=self.lmeval.limit,
            device="cuda:0",
            apply_chat_template=self.lmeval.apply_chat_template,
            batch_size=self.lmeval.batch_size,
        )
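
The new `apply_chat_template` field defaults to False, so existing configs keep their behavior; a config opts in by setting it under `lmeval`, as in the YAML files above. Below is a reduced, illustrative sketch of how that parsing works; only the fields relevant to the new flag are reproduced, under an invented class name.

```python
import yaml
from pydantic import BaseModel


class LmEvalConfigSketch(BaseModel):
    # Reduced stand-in for LmEvalConfig, showing only the fields relevant here.
    num_fewshot: int = 5
    limit: int = 1000
    batch_size: int = 100
    apply_chat_template: bool = False  # new flag; stays off unless the YAML sets it


cfg = yaml.safe_load("""
num_fewshot: 0
limit: 100
batch_size: 100
apply_chat_template: True
""")

lmeval = LmEvalConfigSketch(**cfg)
assert lmeval.apply_chat_template is True
```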

25 changes: 25 additions & 0 deletions tests/testing_utils.py
@@ -285,6 +285,31 @@ def process(sample):
"images": sample["image"],
}

# "neuralmagic/calibration"
elif ds_name == "calibration":

def process(example):
messages = []
for message in example["messages"]:
messages.append(
{
"role": message["role"],
"content": [{"type": "text", "text": message["content"]}],
}
)

return processor.apply_chat_template(
messages,
return_tensors="pt",
padding=False,
truncation=True,
max_length=max_seq_length,
tokenize=True,
add_special_tokens=False,
return_dict=True,
add_generation_prompt=False,
)

else:
raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")
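
A minimal standalone sketch of what this branch does to one calibration sample, assuming the processor is loaded with transformers' `AutoProcessor`; the checkpoint name is just the model used elsewhere in this PR, and the messages are invented for illustration.

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")

# One text-only sample in the calibration dataset's message format.
example = {
    "messages": [
        {"role": "user", "content": "What does FP8 quantization change?"},
        {"role": "assistant", "content": "It stores weights and activations in 8-bit floats."},
    ]
}

# Wrap plain string contents in the structured {"type": "text", ...} form,
# then tokenize through the chat template, mirroring the branch above.
messages = [
    {"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
    for m in example["messages"]
]
encoded = processor.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=False,
)
print(encoded["input_ids"].shape)
```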
