def data_collator(batch):
    """Turn a single-sample calibration batch into a dict of tensors.

    Every field of the lone sample is converted with ``torch.tensor``. The
    ``"pixel_values"`` field is additionally cast to ``torch.bfloat16`` and
    has its leading dimension squeezed (``squeeze(0)`` is a no-op unless that
    dimension has size 1).

    :param batch: sequence holding exactly one mapping of feature name to
        tensor-convertible data
    :return: dict with the same keys as the sample and tensor values
    :raises ValueError: if ``batch`` does not contain exactly one sample
    """
    # An explicit raise instead of ``assert``: assertions are removed under
    # ``python -O``, which would let a larger batch through and silently
    # discard every sample after the first.
    if len(batch) != 1:
        raise ValueError(
            f"data_collator expects a batch of exactly 1 sample, got {len(batch)}"
        )

    sample = batch[0]
    collated = {}
    for key, value in sample.items():
        if key == "pixel_values":
            collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        else:
            collated[key] = torch.tensor(value)
    return collated
a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -1,20 +1,25 @@ cadence: "weekly" -model: Qwen/Qwen2.5-VL-7B-Instruct -model_class: Qwen2_5_VLForConditionalGeneration +model: Qwen/Qwen3-VL-8B-Instruct +model_class: Qwen3VLForConditionalGeneration scheme: INT8_dyn_per_token recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml -dataset_id: lmms-lab/flickr30k -dataset_split: "test[:512]" +dataset_id: neuralmagic/calibration +dataset_config: LLM +dataset_split: "train[:512]" lmeval: model: "hf-multimodal" model_args: dtype: bfloat16 - add_bos_token: True convert_img_format: True - task: mmmu_val_literature + task: chartqa + apply_chat_template: True num_fewshot: 0 - batch_size: 8 - # dense model achieves accuracy of 0.9 +/ 0.0557 + batch_size: 100 + limit: 100 + # dense model achieves exact_match accuracy of 0.520 + # dense model achieves relaxed_accuracy of 0.780 + # dense model achieves anywhere_accuracy of 0.800 metrics: - acc,none: 0.833 - acc_stderr,none: 0.0557 \ No newline at end of file + exact_match,none: 0.550 + relaxed_accuracy,none: 0.770 + anywhere_accuracy,none: 0.770 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml index 37b162b37..121cc14bc 100644 --- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml +++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml @@ -1,20 +1,25 @@ cadence: "weekly" -model: Qwen/Qwen2.5-VL-7B-Instruct -model_class: Qwen2_5_VLForConditionalGeneration +model: Qwen/Qwen3-VL-8B-Instruct +model_class: Qwen3VLForConditionalGeneration scheme: W4A16_actorder_weight recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml -dataset_id: lmms-lab/flickr30k -dataset_split: "test[:512]" +dataset_id: neuralmagic/calibration +dataset_config: LLM +dataset_split: "train[:512]" lmeval: model: "hf-multimodal" model_args: 
def process(example):
    """Tokenize one chat-style sample through the processor's chat template.

    Each entry of ``example["messages"]`` is rewrapped so that its ``content``
    becomes a one-element list holding a ``{"type": "text"}`` part, keeping the
    original ``role``. The rewrapped conversation is then passed to
    ``processor.apply_chat_template``.

    ``processor`` and ``max_seq_length`` are not defined here; they are taken
    from the enclosing scope.

    :param example: mapping with a ``"messages"`` list whose items carry
        ``"role"`` and ``"content"`` entries
    :return: the result of ``processor.apply_chat_template`` for the options
        below (tokenized, ``return_dict=True``, ``return_tensors="pt"``)
    """
    conversation = [
        {
            "role": turn["role"],
            "content": [{"type": "text", "text": turn["content"]}],
        }
        for turn in example["messages"]
    ]

    # Options forwarded unchanged to the chat template call.
    template_options = {
        "return_tensors": "pt",
        "padding": False,
        "truncation": True,
        "max_length": max_seq_length,
        "tokenize": True,
        "add_special_tokens": False,
        "return_dict": True,
        "add_generation_prompt": False,
    }
    return processor.apply_chat_template(conversation, **template_options)