diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index a800c3770..f052a7ca7 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -62,6 +62,21 @@ def data_collator(batch):
 
             oneshot_kwargs["data_collator"] = data_collator
 
+        elif "calibration" in dataset_id:
+
+            def data_collator(batch):  # collates one calibration sample
+                assert len(batch) == 1  # exactly one sample per batch
+                return {
+                    key: (
+                        torch.tensor(value)
+                        if key != "pixel_values"  # pixel_values: bf16 + squeeze(0)
+                        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+                    )
+                    for key, value in batch[0].items()
+                }
+
+            oneshot_kwargs["data_collator"] = data_collator
+
     oneshot_kwargs["model"] = loaded_model
     if recipe:
         oneshot_kwargs["recipe"] = recipe
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
index 3fde9d4c6..9199dd978 100644
--- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -1,18 +1,22 @@
 cadence: weekly
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: FP8_DYNAMIC
 recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 100
+  # dense model achieves exact_match accuracy of 0.530
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    exact_match,none: 0.530
+    relaxed_accuracy,none: 0.780
+    anywhere_accuracy,none: 0.810
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
index 86c2f0e12..89145844e 100644
--- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -1,20 +1,25 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 100
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.833
-    acc_stderr,none: 0.0557
\ No newline at end of file
+    exact_match,none: 0.550
+    relaxed_accuracy,none: 0.770
+    anywhere_accuracy,none: 0.770
\ No newline at end of file
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
index 37b162b37..121cc14bc 100644
--- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
+++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -1,20 +1,25 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 100
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
\ No newline at end of file
+    exact_match,none: 0.540
+    relaxed_accuracy,none: 0.780
+    anywhere_accuracy,none: 0.800
\ No newline at end of file
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index a44cd042f..da99782c6 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
     num_fewshot: int = 5
     limit: int = 1000
     batch_size: int = 100
+    apply_chat_template: bool = False
     # Recovery testing (default): compare against base model performance
     # Default threshold is 0.95 (retain ≥95% of base), can be overridden
     recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
@@ -190,6 +192,7 @@ def _run_lm_eval(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 4ce6a5de6..2cf69720b 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -285,6 +285,31 @@ def process(sample):
                 "images": sample["image"],
             }
 
+    # "neuralmagic/calibration": each sample carries a "messages" list of chat turns
+    elif ds_name == "calibration":
+
+        def process(example):  # tokenize one sample's chat messages
+            messages = []  # same turns, with content wrapped as typed text parts
+            for message in example["messages"]:
+                messages.append(
+                    {
+                        "role": message["role"],
+                        "content": [{"type": "text", "text": message["content"]}],
+                    }
+                )
+
+            return processor.apply_chat_template(
+                messages,
+                return_tensors="pt",
+                padding=False,
+                truncation=True,  # truncate to max_length below
+                max_length=max_seq_length,  # taken from the enclosing scope
+                tokenize=True,
+                add_special_tokens=False,
+                return_dict=True,
+                add_generation_prompt=False,
+            )
+
     else:
         raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")